Code example #1
  def __addPool( self, poolName ):
    # create a new thread Pool, by default it has 2 executing threads and 40 requests
    # in the Queue

    if not poolName:
      return None
    if poolName in self.pools:
      return None
    pool = ThreadPool( self.am_getOption( 'minThreadsInPool' ),
                       self.am_getOption( 'maxThreadsInPool' ),
                       self.am_getOption( 'totalThreadsInPool' ) )
    # Daemonize except "Default" pool
    if poolName != 'Default':
      pool.daemonize()
    self.pools[poolName] = pool
    return poolName
Code example #2
    def __addPool(self, poolName):
        # create a new thread Pool, by default it has 2 executing threads and 40 requests
        # in the Queue

        if not poolName:
            return None
        if poolName in self.pools:
            return None
        pool = ThreadPool(self.am_getOption('minThreadsInPool'),
                          self.am_getOption('maxThreadsInPool'),
                          self.am_getOption('totalThreadsInPool'))
        # Daemonize except "Default" pool
        if poolName != 'Default':
            pool.daemonize()
        self.pools[poolName] = pool
        return poolName
Code example #3
File: TaskQueueDirector.py Project: yujikato/DIRAC
    def __addPool(self, poolName):
        """
      create a new thread Pool, by default it has 2 executing threads and 40 requests
      in the Queue
    """

        if not poolName:
            return None
        if poolName in self.pools:
            return None
        pool = ThreadPool(
            self.am_getOption("minThreadsInPool"),
            self.am_getOption("maxThreadsInPool"),
            self.am_getOption("totalThreadsInPool"),
        )
        # Daemonize except "Default" pool
        if poolName != "Default":
            pool.daemonize()
        self.pools[poolName] = pool
        return poolName
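
The three snippets above are the same registry pattern: lazily create a named DIRAC ThreadPool, daemonize every pool except "Default", and index it by name. Below is a minimal self-contained sketch of that pattern; the ThreadPool(min, max, maxQueued) constructor and daemonize() call come from the snippets themselves, while the import path and the default sizes (2 threads, 40 queued requests, per the comments) are assumptions.

from DIRAC.Core.Utilities.ThreadPool import ThreadPool  # import path assumed

class PoolRegistry(object):

    def __init__(self, minThreads=2, maxThreads=2, maxQueued=40):
        # defaults mirror the "2 executing threads and 40 requests" comment
        self.pools = {}
        self.minThreads = minThreads
        self.maxThreads = maxThreads
        self.maxQueued = maxQueued

    def addPool(self, poolName):
        # refuse empty names and duplicates, exactly as __addPool does
        if not poolName or poolName in self.pools:
            return None
        pool = ThreadPool(self.minThreads, self.maxThreads, self.maxQueued)
        # daemonize everything except the "Default" pool
        if poolName != 'Default':
            pool.daemonize()
        self.pools[poolName] = pool
        return poolName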
Code example #4
File: FTSAgent.py Project: DIRACGrid-test/DIRAC
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled requests to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts placement refresh in seconds
  FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2
  # # placeholder for max jobs per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # max FTS transfer attempts per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # Max number of requests fetched from the RMS
  MAX_REQUESTS = 100
  # Minimum interval (seconds) between two monitorings of the same job
  MONITORING_INTERVAL = 600

  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for the FTS version
  __ftsVersion = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSPlacement
  __ftsPlacement = None

  # # placement regeneration time delta
  __ftsPlacementValidStamp = None

  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient



  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient
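
  # NB: the getters above build each client lazily on first use and cache it
  # in the class-level placeholders declared earlier, so every FTSAgent
  # instance shares a single client of each kind.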

  @classmethod
  def getRequest( cls, reqID ):
    """ get Requests systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqID )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqID, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "request of id '%s' not found in ReqDB" % reqID )
    cls.__reqCache[reqID] = getRequest

    return S_OK( cls.__reqCache[reqID] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestID not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestID, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.iteritems():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict
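
  # Merge semantics of updateFTSFileDict, with illustrative values:
  #   ftsFilesDict = { "toSubmit": [f1], "toFail": [] }
  #   toUpdateDict = { "toSubmit": [f1, f2], "toFail": [f3] }
  #   result       = { "toSubmit": [f1, f2], "toFail": [f3] }  (f1 not duplicated)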

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool


  def resetFTSPlacement( self ):
    """ create fts Placement """

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()

    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """


      # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH )
    log.info( "FTSPlacement validity period       = %s s" % self.FTSPLACEMENT_REFRESH )


    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission  = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = ", str( self.MAX_FILES_PER_JOB ) )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = ", str( self.MAX_ATTEMPT ) )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads         = ", str( self.MAX_THREADS ) )

    self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS )
    log.info( "Max Requests fetched           = ", str( self.MAX_REQUESTS ) )

    self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL )
    log.info( "Minimum monitoring interval    = ", str( self.MONITORING_INTERVAL ) )

    self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' )
    log.info( "FTSVersion : %s" % self.__ftsVersion )
    log.info( "initialize: creation of FTSPlacement..." )
    createPlacement = self.resetFTSPlacement()
    if not createPlacement["OK"]:
      log.error( "initialize:", createPlacement["Message"] )
      return createPlacement

    # This sets the default proxy to be used to the one defined under
    # /Operations/Shifter/DataManager.
    # The shifterProxy option in the configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    self.registrationProtocols = getRegistrationProtocols()


    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #  log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #  log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #  put = self.requestClient().putRequest( request )
    #  if not put["OK"]:
    #    log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """

    # Don't use the server certificate otherwise the DFC won't let us write
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )


    log = gLogger.getSubLogger( "execute" )
    # # reset FTSPlacement if expired
    now = datetime.datetime.now()
    if now > self.__ftsPlacementValidStamp:
      log.info( "resetting expired FTS placement..." )
      resetFTSPlacement = self.resetFTSPlacement()
      if not resetFTSPlacement["OK"]:
        log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] )
        return resetFTSPlacement
      self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS )
    if not requestIDs["OK"]:
      log.error( "unable to read scheduled request ids" , requestIDs["Message"] )
      return requestIDs
    if not requestIDs["Value"]:
      requestIDs = []
    else:
      requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ]
    requestIDs += self.__reqCache.keys()

    if not requestIDs:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestIDs ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) )

    for requestID in requestIDs:
      request = self.getRequest( requestID )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestID
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )
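      # NB: the while-loop above busy-waits, sleeping 1 s at a time, until the
      # thread pool has a free slot to accept the job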

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "Waiting returned operation is not an operation:", type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

      now = datetime.datetime.utcnow()
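      # monitor a job only when its last update is older than the monitoring
      # interval, tripled when the source SE is a tape SE (staging is slower)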
      jobsToMonitor = [job for job in ftsJobs if
                       ( now - job.LastUpdate ).seconds >
                       ( self.MONITORING_INTERVAL * ( 3. if StorageElement( job.SourceSE ).getStatus().get( 'Value', {} ).get( 'TapeSE' ) else 1. ) )
                       ]
      if jobsToMonitor:
        log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in jobsToMonitor:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.iteritems():
          if ftsFiles:
            log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )
      if len( ftsJobs ) != len( jobsToMonitor ):
        log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) )
        if not jobsToMonitor:
          # Nothing to do this time, escape
          raise EscapeTryException

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # The condition below should never happen; log a warning so it can be checked
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems()
                              if not [ftsFile for ftsFile in ftsFiles
                                      if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Active transfers for which there is no FTS job any longer and reschedule them
            for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]:
                ftsFilesDict['toReschedule'].append( ftsFile )
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

            # Recover files that are Failed but were not spotted
            for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              reschedule, submit, fail = self.__checkFailed( ftsFile )
              if fail and ftsFile not in ftsFilesDict['toFail']:
                ftsFilesDict['toFail'].append( ftsFile )
              elif reschedule and ftsFile not in ftsFilesDict['toReschedule']:
                ftsFilesDict['toReschedule'].append( ftsFile )
              elif submit and ftsFile not in ftsFilesDict['toSubmit']:
                ftsFilesDict['toSubmit'].append( ftsFile )

            # If all transfers are finished for unregistered files and there is already a registration operation, set it Done
            ftsLFNs = [f.LFN for f in ftsFiles]
            for lfn in missingReplicas:
              # We make sure here that the file is being processed by FTS
              if lfn in ftsLFNs:
                if not [f for f in ftsFiles if f.LFN == lfn and ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]:
                  for opFile in operation:
                    if opFile.LFN == lfn:
                      opFile.Status = 'Done'
                      break
              else:
                # Temporary log
                log.warn( "File with missing replica not in FTS files", lfn )
          for key, ftsFiles in ftsFilesDict.iteritems():
            if ftsFiles:
              log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing replicas" % len( toFail )
        # # request.Status should be Failed if all files in the operation are "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise EscapeTryException

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.iteritems():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( 'Failed to reschedule files', rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set( ftsFile.FTSFileID for ftsFile in toSubmit )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # If the file has not failed unrecoverably and is not yet set toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # should not put back jobs that have not been monitored this time
      ftsJobs = jobsToMonitor
      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestID, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except EscapeTryException:
      # This clause is raised when one wants to return from within the try: clause
      # only put back jobs that were monitored
      ftsJobs = jobsToMonitor
    except Exception as exceptMessage:
      log.exception( "Exception in processRequest", lException = exceptMessage )
    finally:
      putRequest = self.putRequest( request, clearCache = ( request.Status != "Scheduled" ) )
      if not putRequest["OK"]:
        log.error( "unable to put back request:", putRequest["Message"] )
      # #  put back jobs in all cases
      if ftsJobs:
        for ftsJob in list( ftsJobs ):
          if not len( ftsJob ):
            log.warn( 'FTS job empty, removed: %s' % ftsJob.FTSGUID )
            self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
            ftsJobs.remove( ftsJob )
        putJobs = self.putFTSJobs( ftsJobs )
        if not putJobs["OK"]:
          log.error( "unable to put back FTSJobs:", putJobs["Message"] )
          putRequest = putJobs
    # This is where one returns from after execution of the finally: block
    return putRequest

  def __checkDuplicates( self, reqID, toSubmit ):
    """ Check in a list of FTSFiles whether there are duplicates
    """
    tupleList = []
    log = self.log.getSubLogger( "%s/checkDuplicates" % reqID )
    for ftsFile in list( toSubmit ):
      fTuple = ( ftsFile.LFN, ftsFile.SourceSE, ftsFile.TargetSE )
      if fTuple in tupleList:
        log.warn( "Duplicate file to submit, removed:", ', '.join( fTuple ) )
        toSubmit.remove( ftsFile )
        self.ftsClient().deleteFTSFiles( ftsFile.OperationID, [ftsFile.FileID] )
      else:
        tupleList.append( fTuple )


  def __reschedule( self, request, operation, toReschedule ):
    """ reschedule list of :toReschedule: files in request for operation :operation:

    :param Request request:
    :param Operation operation:
    :param list toReschedule: list of FTSFiles
    """
    log = self.log.getSubLogger( "req_%s/%s/reschedule" % ( request.RequestID, request.RequestName ) )

    ftsFileIDs = [ftsFile.FileID for ftsFile in toReschedule]
    for opFile in operation:
      if opFile.FileID in ftsFileIDs:
        opFile.Status = "Waiting"

    toSchedule = []

    # # filter files
    for opFile in [ opFile for opFile in operation if opFile.Status == "Waiting" ]:

      replicas = self.__filterReplicas( opFile )
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]
      validReplicas = replicas["Valid"]
      noMetaReplicas = replicas["NoMetadata"]
      noReplicas = replicas["NoReplicas"]
      badReplicas = replicas['Bad']

      if validReplicas:
        validTargets = list( set( operation.targetSEList ) - set( validReplicas ) )
        if not validTargets:
          log.info( "file %s is already present at all targets" % opFile.LFN )
          opFile.Status = "Done"
        else:
          toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) )
      elif noMetaReplicas:
        log.warn( "unable to schedule '%s', couldn't get metadata at %s" % ( opFile.LFN, ','.join( noMetaReplicas ) ) )
      elif noReplicas:
        log.warn( "unable to schedule %s, file doesn't exist at %s" % ( opFile.LFN, ','.join( noReplicas ) ) )
        opFile.Status = 'Failed'
      elif badReplicas:
        log.warn( "unable to schedule %s, all replicas have a bad checksum at %s" % ( opFile.LFN, ','.join( badReplicas ) ) )
        opFile.Status = 'Failed'

    # # do real schedule here
    if toSchedule:
      log.info( "Rescheduling %d files" % len( toReschedule ) )
      ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID,
                                                  operation.OperationID,
                                                  toSchedule )
      if not ftsSchedule["OK"]:
        log.error( "Error scheduling files", ftsSchedule["Message"] )
        return ftsSchedule

      ftsSchedule = ftsSchedule["Value"]
      for opFile in operation:
        fileID = opFile.FileID
        if fileID in ftsSchedule["Successful"]:
          opFile.Status = "Scheduled"
        elif fileID in ftsSchedule["Failed"]:
          opFile.Error = ftsSchedule["Failed"][fileID]
          log.error( "Error scheduling file %s" % opFile.LFN, opFile.Error )

    return S_OK()

  def __submit( self, request, operation, toSubmit ):
    """ create and submit new FTSJobs using list of FTSFiles

    :param Request request: ReqDB.Request instance
    :param Operation operation: ReplicateAndRegister operation
    :param list toSubmit: list of FTSFile instances

    :return: S_OK( [ FTSJob, FTSJob, ... ] )
    """
    log = self.log.getSubLogger( "req_%s/%s/submit" % ( request.RequestID, request.RequestName ) )

    bySourceAndTarget = {}
    for ftsFile in toSubmit:
      bySourceAndTarget.setdefault( ftsFile.SourceSE, {} ).setdefault( ftsFile.TargetSE, [] ).append( ftsFile )

    ftsJobs = []

    for source, targetDict in bySourceAndTarget.iteritems():

      for target, ftsFileList in targetDict.iteritems():

        log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) )

        route = self.__ftsPlacement.findRoute( source, target )
        if not route["OK"]:
          log.error( route["Message"] )
          continue
        route = route["Value"]

        routeValid = self.__ftsPlacement.isRouteValid( route )

        if not routeValid['OK']:
          log.error( "Route invalid : %s" % routeValid['Message'] )
          continue

        sourceSE = StorageElement( source )
        sourceToken = sourceSE.getStorageParameters( protocol = 'srm' )
        if not sourceToken["OK"]:
          log.error( "unable to get sourceSE parameters:", "(%s) %s" % ( source, sourceToken["Message"] ) )
          continue
        seStatus = sourceSE.getStatus()['Value']

        targetSE = StorageElement( target )
        targetToken = targetSE.getStorageParameters( protocol = 'srm' )
        if not targetToken["OK"]:
          log.error( "unable to get targetSE parameters:", "(%s) %s" % ( target, targetToken["Message"] ) )
          continue

        # # create FTSJob
        for fileList in breakListIntoChunks( ftsFileList, self.MAX_FILES_PER_JOB ):
          ftsJob = FTSJob()
          ftsJob.RequestID = request.RequestID
          ftsJob.OperationID = operation.OperationID
          ftsJob.SourceSE = source
          ftsJob.TargetSE = target
          ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" )
          ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" )
          ftsJob.FTSServer = route.ftsServer

          for ftsFile in fileList:
            ftsFile.Attempt += 1
            ftsFile.Error = ""
            ftsJob.addFile( ftsFile )

          submit = ftsJob.submitFTS( self.__ftsVersion, command = self.SUBMIT_COMMAND, pinTime = self.PIN_TIME if seStatus['TapeSE'] else 0 )
          if not submit["OK"]:
            log.error( "unable to submit FTSJob:", submit["Message"] )
            continue

          log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

          # # update statuses for job files
          for ftsFile in ftsJob:
            ftsFile.FTSGUID = ftsJob.FTSGUID
            ftsFile.Status = "Submitted"
            ftsFile.Attempt += 1

          # # update placement route
          try:
            self.updateLock().acquire()
            self.__ftsPlacement.startTransferOnRoute( route )
          finally:
            self.updateLock().release()

          ftsJobs.append( ftsJob )

    log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) )
    return S_OK( ftsJobs )

  def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID, request.RequestName, ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

    monitor = ftsJob.monitorFTS( self.__ftsVersion, command = self.MONITOR_COMMAND )
    if not monitor["OK"]:
      gMonitor.addMark( "FTSMonitorFail", 1 )
      log.error( monitor["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \
         'was not found' in monitor['Message'] or\
         "Not found" in monitor['Message'] or\
         'Unknown transfer state' in monitor['Message']:
        log.error( "FTSJob not known (expired on server?): delete it" )
        for ftsFile in ftsJob:
          ftsFile.Status = "Waiting"
          ftsFilesDict["toSubmit"].append( ftsFile )
        # #  No way further for that job: delete it
        res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
        if not res['OK']:
          log.error( "Unable to delete FTSJob", res['Message'] )
        return S_OK( ftsFilesDict )
      return monitor

    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s%%" % ( ftsJob.Status, ftsJob.Completeness ) )

    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )

    if ftsJob.Status in FTSJob.FINALSTATES:
      finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
      if not finalizeFTSJob["OK"]:
        if 'Unknown transfer state' in finalizeFTSJob['Message']:
          for ftsFile in ftsJob:
            ftsFile.Status = "Waiting"
            ftsFilesDict["toSubmit"].append( ftsFile )
          # #  No way further for that job: delete it
          res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
          if not res['OK']:
            log.error( "Unable to delete FTSJob", res['Message'] )
        else:
          log.error( finalizeFTSJob["Message"] )
          return finalizeFTSJob
      else:
        ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )

    return S_OK( ftsFilesDict )

  def __finalizeFTSJob( self, request, ftsJob ):
    """ finalize FTSJob

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s/finalize" % ( request.RequestID,
                                                                     request.RequestName,
                                                                     ftsJob.FTSJobID ) )
    log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )


    monitor = ftsJob.monitorFTS( self.__ftsVersion, command = self.MONITOR_COMMAND, full = True )
    if not monitor["OK"]:
      log.error( monitor["Message"] )
      return monitor

    # # split FTSFiles to different categories
    processFiles = self.__filterFiles( ftsJob )
    if not processFiles["OK"]:
      log.error( processFiles["Message"] )
      return processFiles
    processFiles = processFiles['Value']
    if processFiles['toRegister']:
      log.error( "Some files could not be registered in FC:", len( processFiles['toRegister'] ) )
    ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles )

    # # send accounting record for this job
    self.__sendAccounting( ftsJob, request.OwnerDN )

    # # update placement - remove this job from placement
    route = self.__ftsPlacement.findRoute( ftsJob.SourceSE, ftsJob.TargetSE )
    if route["OK"]:
      try:
        self.updateLock().acquire()
        self.__ftsPlacement.finishTransferOnRoute( route['Value'] )
      finally:
        self.updateLock().release()

    log.info( "FTSJob is finalized" )

    return S_OK( ftsFilesDict )

  def __checkFailed( self, ftsFile ):
    reschedule = False
    submit = False
    fail = False
    if ftsFile.Status in ( "Failed", 'Canceled' ):
      if ftsFile.Error == "MissingSource":
        reschedule = True
      else:
        if ftsFile.Attempt < self.MAX_ATTEMPT:
          submit = True
        else:
          fail = True
    return reschedule, submit, fail
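
  # Decision summary for __checkFailed:
  #   Error == "MissingSource"            -> reschedule
  #   otherwise, Attempt <  MAX_ATTEMPT   -> submit (retry)
  #   otherwise, Attempt >= MAX_ATTEMPT   -> fail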

  def __filterFiles( self, ftsJob ):
    """ process ftsFiles from finished ftsJob

    :param FTSJob ftsJob: monitored FTSJob instance
    """
    # # lists for different categories
    toUpdate = []
    toReschedule = []
    toRegister = []
    toSubmit = []
    toFail = []

    # # loop over files in fts job
    for ftsFile in ftsJob:
      # # successful files
      if ftsFile.Status == "Finished":
        if ftsFile.Error == "AddCatalogReplicaFailed":
          toRegister.append( ftsFile )
        toUpdate.append( ftsFile )
        continue
      reschedule, submit, fail = self.__checkFailed( ftsFile )
      if reschedule:
        toReschedule.append( ftsFile )
      elif submit:
        toSubmit.append( ftsFile )
      elif fail:
        toFail.append( ftsFile )

    return S_OK( { "toUpdate": toUpdate,
                   "toSubmit": toSubmit,
                   "toRegister": toRegister,
                   "toReschedule": toReschedule,
                   "toFail": toFail } )

  def __insertRegisterOperation( self, request, operation, toRegister ):
    """ add RegisterReplica operation

    :param Request request: request instance
    :param Operation operation: 'ReplicateAndRegister' operation for this FTSJob
    :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
    """
    log = self.log.getSubLogger( "req_%s/%s/registerFiles" % ( request.RequestID, request.RequestName ) )

    byTarget = {}
    for ftsFile in toRegister:
      byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile )
    log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) )

    for target, ftsFileList in byTarget.iteritems():
      log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target,
                                                                                            len( ftsFileList ) ) )
      registerOperation = Operation()
      registerOperation.Type = "RegisterReplica"
      registerOperation.Status = "Waiting"
      registerOperation.TargetSE = target
      targetSE = StorageElement( target )
      for ftsFile in ftsFileList:
        opFile = File()
        opFile.LFN = ftsFile.LFN
        pfn = returnSingleResult( targetSE.getURL( ftsFile.LFN, protocol = self.registrationProtocols ) )
        if not pfn["OK"]:
          continue
        opFile.PFN = pfn["Value"]
        registerOperation.addFile( opFile )
      request.insertBefore( registerOperation, operation )

    return S_OK()

  @staticmethod
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send DataOperation to AccouringDB """

    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )

    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"

    username = getUsernameForDN( ownerDN )
    if not username["OK"]:
      username = ownerDN
    else:
      username = username["Value"]

    accountingDict["User"] = username
    accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
    accountingDict['ExecutionSite'] = ftsJob.FTSServer

    accountingDict['RegistrationTime'] = ftsJob._regTime
    accountingDict['RegistrationOK'] = ftsJob._regSuccess
    accountingDict['RegistrationTotal'] = ftsJob._regTotal

    accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE

    # dt = ftsJob.LastUpdate - ftsJob.SubmitTime
    # transferTime = dt.days * 86400 + dt.seconds
    # accountingDict["TransferTime"] = transferTime
    accountingDict['TransferTime'] = sum( int( f._duration ) for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES )
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()

  def __checkReadyReplicas( self, request, operation ):
    """ check ready replicas for transferOperation """
    log = self.log.getSubLogger( "req_%s/%s/checkReadyReplicas" % ( request.RequestID, request.RequestName ) )

    targetSESet = set( operation.targetSEList )

    # # { LFN: [ targetSE, ... ] }
    missingReplicas = {}

    scheduledFiles = dict( ( opFile.LFN, opFile ) for opFile in operation if opFile.Status in ( "Scheduled", "Waiting" ) )
    # # get replicas
    replicas = FileCatalog().getReplicas( scheduledFiles.keys() )
    if not replicas["OK"]:
      log.error( replicas["Message"] )
      return replicas
    replicas = replicas["Value"]

    fullyReplicated = 0
    missingSEs = {}
    for successfulLFN in replicas["Successful"]:
      reps = set( replicas['Successful'][successfulLFN] )
      if targetSESet.issubset( reps ):
        log.verbose( "%s has been replicated to all targets" % successfulLFN )
        fullyReplicated += 1
        scheduledFiles[successfulLFN].Status = "Done"
      else:
        missingReplicas[successfulLFN] = sorted( targetSESet - reps )
        ses = ",".join( missingReplicas[ successfulLFN ] )
        missingSEs[ses] = missingSEs.setdefault( ses, 0 ) + 1
        log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) )
    if fullyReplicated:
      log.info( "%d new files have been replicated to all targets" % fullyReplicated )
    if missingSEs:
      for ses in missingSEs:
        log.info( "%d replicas still missing at %s" % ( missingSEs[ses], ses ) )

    reMissing = re.compile( "no such file or directory" )
    for failedLFN, errStr in replicas["Failed"].iteritems():
      scheduledFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        log.error( "%s is missing, setting its status to 'Failed'" % failedLFN )
        scheduledFiles[failedLFN].Status = "Failed"
      else:
        log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) )

    return S_OK( missingReplicas )

  def __filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs """
    from DIRAC.DataManagementSystem.Agent.RequestOperations.ReplicateAndRegister import filterReplicas
    return filterReplicas( opFile, logger = self.log, dataManager = self.dataManager )
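
Every call in the agent above returns DIRAC's result dictionary and is checked through result["OK"]. A minimal sketch of that convention, showing just the shape the code relies on (the real S_OK/S_ERROR live in DIRAC.Core.Utilities.ReturnValues and carry extra metadata):

def S_OK(value=None):
    # success: callers read the payload from result["Value"]
    return {"OK": True, "Value": value}

def S_ERROR(message=""):
    # failure: callers read the reason from result["Message"]
    return {"OK": False, "Message": message}

result = S_OK([1, 2, 3])
if result["OK"]:
    print(result["Value"])    # [1, 2, 3]
else:
    print(result["Message"])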
Code example #5
File: Service.py Project: IgorPelevanyuk/DIRAC
class Service:

  SVC_VALID_ACTIONS = { 'RPC' : 'export',
                        'FileTransfer': 'transfer',
                        'Message' : 'msg',
                        'Connection' : 'Message' }
  SVC_SECLOG_CLIENT = SecurityLogClient()

  def __init__( self, serviceData ):
    self._svcData = serviceData
    self._name = serviceData[ 'loadName' ]
    self._startTime = Time.dateTime()
    self._validNames = [ serviceData[ 'modName' ]  ]
    if serviceData[ 'loadName' ] not in self._validNames:
      self._validNames.append( serviceData[ 'loadName' ] )
    self._cfg = ServiceConfiguration( list( self._validNames ) )
    if serviceData[ 'standalone' ]:
      self._monitor = gMonitor
    else:
      self._monitor = MonitoringClient()
    self.__monitorLastStatsUpdate = time.time()
    self._stats = { 'queries' : 0, 'connections' : 0 }
    self._authMgr = AuthManager( "%s/Authorization" % PathFinder.getServiceSection( serviceData[ 'loadName' ] ) )
    self._transportPool = getGlobalTransportPool()
    self.__cloneId = 0
    self.__maxFD = 0

  def setCloneProcessId( self, cloneId ):
    self.__cloneId = cloneId
    self._monitor.setComponentName( "%s-Clone:%s" % ( self._name, cloneId ) )

  def _isMetaAction( self, action ):
    referedAction = Service.SVC_VALID_ACTIONS[ action ]
    if referedAction in Service.SVC_VALID_ACTIONS:
      return referedAction
    return False

  def initialize( self ):
    #Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR( "Could not build service URL for %s" % self._name )
    gLogger.verbose( "Service URL is %s" % self._url )
    #Load handler
    result = self._loadHandlerInit()
    if not result[ 'OK' ]:
      return result
    self._handler = result[ 'Value' ]
    #Initialize lock manager
    self._lockManager = LockManager( self._cfg.getMaxWaitingPetitions() )
    self._initMonitoring()
    self._threadPool = ThreadPool( 1,
                                   max( 0, self._cfg.getMaxThreads() ),
                                   self._cfg.getMaxWaitingPetitions() )
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker( "%sMSB" % self._name, threadPool = self._threadPool )
    #Create static dict
    self._serviceInfoDict = { 'serviceName' : self._name,
                              'serviceSectionPath' : PathFinder.getServiceSection( self._name ),
                              'URL' : self._cfg.getURL(),
                              'messageSender' : MessageSender( self._name, self._msgBroker ),
                              'validNames' : self._validNames,
                              'csPaths' : [ PathFinder.getServiceSection( svcName ) for svcName in self._validNames ]
                             }
    #Call static initialization function
    try:
      self._handler[ 'class' ]._rh__initializeClass( dict( self._serviceInfoDict ),
                                                     self._lockManager,
                                                     self._msgBroker,
                                                     self._monitor )
      if self._handler[ 'init' ]:
        for initFunc in self._handler[ 'init' ]:
          gLogger.verbose( "Executing initialization function" )
          try:
            result = initFunc( dict( self._serviceInfoDict ) )
          except Exception, excp:
            gLogger.exception( "Exception while calling initialization function" )
            return S_ERROR( "Exception while calling initialization function: %s" % str( excp ) )
          if not isReturnStructure( result ):
            return S_ERROR( "Service initialization function %s must return S_OK/S_ERROR" % initFunc )
          if not result[ 'OK' ]:
            return S_ERROR( "Error while initializing %s: %s" % ( self._name, result[ 'Message' ] ) )
    except Exception, e:
      errMsg = "Exception while initializing %s" % self._name
      gLogger.exception( errMsg )
      return S_ERROR( errMsg )

    #Load actions after the handler has initialized itself
    result = self._loadActions()
    if not result[ 'OK' ]:
      return result
    self._actions = result[ 'Value' ]

    gThreadScheduler.addPeriodicTask( 30, self.__reportThreadPoolContents )

    return S_OK()
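
The SVC_VALID_ACTIONS table above doubles as a meta-action map: 'Connection' points at another action name ('Message') rather than at a handler-method prefix, which is exactly what _isMetaAction tests. A standalone illustration of that lookup (sketch only, using the values from the snippet):

SVC_VALID_ACTIONS = {'RPC': 'export',
                     'FileTransfer': 'transfer',
                     'Message': 'msg',
                     'Connection': 'Message'}

def isMetaAction(action):
    # an action is "meta" when its mapped value is itself a key of the table
    referredAction = SVC_VALID_ACTIONS[action]
    if referredAction in SVC_VALID_ACTIONS:
        return referredAction
    return False

print(isMetaAction('Connection'))  # 'Message' -> meta action
print(isMetaAction('RPC'))         # False     -> direct action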
Code example #6
class OutputDataExecutor:
    def __init__(self, csPath=""):
        self.log = gLogger.getSubLogger("OutputDataExecutor")
        if not csPath:
            vo = gConfig.getValue("/DIRAC/VirtualOrganization", "")
            self.__transfersCSPath = '/Operations/%s/OutputData' % vo
        else:
            self.__transfersCSPath = csPath
        self.log.verbose("Reading transfer paths from %s" %
                         self.__transfersCSPath)
        self.__requiredCSOptions = [
            'InputPath', 'InputFC', 'OutputPath', 'OutputFC', 'OutputSE'
        ]

        self.__threadPool = ThreadPool(
            gConfig.getValue("%s/MinTransfers" % self.__transfersCSPath, 1),
            gConfig.getValue("%s/MaxTransfers" % self.__transfersCSPath, 4),
            gConfig.getValue("%s/MaxQueuedTransfers" % self.__transfersCSPath,
                             100))
        self.__threadPool.daemonize()
        self.__processingFiles = set()
        self.__okTransferredFiles = 0
        self.__okTransferredBytes = 0
        self.__failedFiles = {}

    def getNumOKTransferredFiles(self):
        return self.__okTransferredFiles

    def getNumOKTransferredBytes(self):
        return self.__okTransferredBytes

    def transfersPending(self):
        return self.__threadPool.isWorking()

    def getDefinedTransferPaths(self):
        result = gConfig.getSections(self.__transfersCSPath)
        if not result['OK']:
            self.log.info('No Input/Output Pair defined in CS')
            return S_OK()

        pathList = result['Value']

        tPaths = {}
        for name in pathList:
            csPath = self.__transfersCSPath + '/%s' % name
            result = gConfig.getOptionsDict(csPath)
            if not result['OK']:
                continue
            transferDict = result['Value']
            ok = True
            for i in self.__requiredCSOptions:
                if i not in transferDict:
                    self.log.error('Missing Option %s in %s' % (i, csPath))
                    ok = False
                    break
            if not ok:
                continue
            tPaths[name] = transferDict

        return S_OK(tPaths)

    def getNumLocalOutgoingFiles(self):
        result = self.getDefinedTransferPaths()
        if not result['OK']:
            return 0
        localOutgoing = 0
        tPaths = result['Value']
        for name in tPaths:
            transferDict = tPaths[name]
            if transferDict['InputFC'] != 'LocalDisk':
                continue
            localOutgoing += len(self.getOutgoingFiles(transferDict))
        return localOutgoing

    def getOutgoingFiles(self, transferDict):
        """
        Get list of files to be processed from InputPath
        """
        inputFCName = transferDict['InputFC']
        inputPath = transferDict['InputPath']

        if inputFCName == 'LocalDisk':
            files = []
            try:
                for file in os.listdir(inputPath):
                    if os.path.isfile(os.path.join(inputPath, file)):
                        files.append(file)
            except OSError:
                # directory unreadable or gone; return whatever was collected
                pass
            return files

        inputFC = FileCatalog([inputFCName])
        result = inputFC.listDirectory(inputPath, True)

        if not result['OK']:
            self.log.error(result['Message'])
            return []
        if inputPath not in result['Value']['Successful']:
            self.log.error(result['Value']['Failed'][inputPath])
            return []

        subDirs = result['Value']['Successful'][inputPath]['SubDirs']
        files = result['Value']['Successful'][inputPath]['Files']
        for dir in subDirs:
            self.log.info('Ignoring subdirectory:', dir)
        return files.keys()

    def checkForTransfers(self):
        """
        Check for transfers to do and start them
        """
        result = self.getDefinedTransferPaths()
        if not result['OK']:
            return result
        tPaths = result['Value']
        for name in tPaths:
            transferPath = tPaths[name]
            self.log.verbose("Checking %s transfer path" % name)
            filesToTransfer = self.getOutgoingFiles(tPaths[name])
            self.log.info("Transfer path %s has %d files" %
                          (name, len(filesToTransfer)))
            ret = self.__addFilesToThreadPool(filesToTransfer, transferPath)
            if not ret['OK']:
                # The thread pool got full
                break

    def processAllPendingTransfers(self):
        self.__threadPool.processAllResults()

    @transferSync
    def __addFilesToThreadPool(self, files, transferDict):
        for file in files:
            file = os.path.basename(file)
            if file in self.__processingFiles:
                continue
            self.__processingFiles.add(file)
            time.sleep(1)
            ret = self.__threadPool.generateJobAndQueueIt(
                self.__transferIfNotRegistered,
                args=(file, transferDict),
                oCallback=self.transferCallback,
                blocking=False)
            if not ret['OK']:
                # The thread pool got full
                return ret
        return S_OK()

    def __transferIfNotRegistered(self, file, transferDict):
        result = self.isRegisteredInOutputCatalog(file, transferDict)
        if not result['OK']:
            self.log.error(result['Message'])
            return result
        #Already registered. Need to delete
        if result['Value']:
            self.log.info(
                "Transfer file %s is already registered in the output catalog"
                % file)
            #Delete
            filePath = os.path.join(transferDict['InputPath'], file)
            if transferDict['InputFC'] == 'LocalDisk':
                os.unlink(filePath)
            else:
                inputFC = FileCatalog([transferDict['InputFC']])
                replicaDict = inputFC.getReplicas(filePath)
                if not replicaDict['OK']:
                    self.log.error("Error deleting file",
                                   replicaDict['Message'])
                elif filePath not in replicaDict['Value']['Successful']:
                    self.log.error("Error deleting file",
                                   replicaDict['Value']['Failed'][filePath])
                else:
                    seList = replicaDict['Value']['Successful'][filePath].keys()
                    for se in seList:
                        se = StorageElement(se)
                        self.log.info('Removing from %s:' % se.name, filePath)
                        se.removeFile(filePath)
                    inputFC.removeFile(filePath)
            self.log.info("File %s deleted from %s" %
                          (file, transferDict['InputFC']))
            self.__processingFiles.discard(file)
            return S_OK(file)
        #Do the transfer
        return self.__retrieveAndUploadFile(file, transferDict)

    def isRegisteredInOutputCatalog(self, file, transferDict):
        fc = FileCatalog([transferDict['OutputFC']])
        lfn = os.path.join(transferDict['OutputPath'], os.path.basename(file))
        result = fc.getReplicas(lfn)
        if not result['OK']:
            return result
        if lfn not in result['Value']['Successful']:
            return S_OK(False)
        replicas = result['Value']['Successful'][lfn]
        for seName in List.fromChar(transferDict['OutputSE'], ","):
            if seName in replicas:
                self.log.verbose(
                    "Transfer file %s is already registered in %s SE" %
                    (file, seName))
                return S_OK(True)
        return S_OK(False)

    def __retrieveAndUploadFile(self, file, outputDict):
        """
    Retrieve, Upload, and remove
    """
        fileName = file
        inputPath = outputDict['InputPath']
        inputFCName = outputDict['InputFC']
        inBytes = 0
        if inputFCName == 'LocalDisk':
            inFile = file
            file = os.path.join(inputPath, file)
        else:
            inputFC = FileCatalog([inputFCName])

            inFile = os.path.join(inputPath, file)
            replicaDict = inputFC.getReplicas(inFile)
            if not replicaDict['OK']:
                self.log.error(replicaDict['Message'])
                return S_ERROR(fileName)
            if inFile not in replicaDict['Value']['Successful']:
                self.log.error(replicaDict['Value']['Failed'][inFile])
                return S_ERROR(fileName)
            seList = replicaDict['Value']['Successful'][inFile].keys()

            inputSE = StorageElement(seList[0])
            self.log.info('Retrieving from %s:' % inputSE.name, inFile)
            # ret = inputSE.getFile( inFile )
            # lcg_util binding prevent multithreading, use subprocess instead
            res = pythonCall(2 * 3600, inputSE.getFile, inFile)
            if not res['OK']:
                self.log.error(res['Message'])
                return S_ERROR(fileName)
            ret = res['Value']
            if not ret['OK']:
                self.log.error(ret['Message'])
                return S_ERROR(fileName)
            if inFile not in ret['Value']['Successful']:
                self.log.error(ret['Value']['Failed'][inFile])
                return S_ERROR(fileName)

        if os.path.isfile(file):
            inBytes = os.stat(file).st_size

        outputPath = outputDict['OutputPath']
        outputFCName = outputDict['OutputFC']
        replicaManager = ReplicaManager()
        outFile = os.path.join(outputPath, os.path.basename(file))
        transferOK = False
        for outputSEName in List.fromChar(outputDict['OutputSE'], ","):
            outputSE = StorageElement(outputSEName)
            self.log.info('Trying to upload to %s:' % outputSE.name, outFile)
            # ret = replicaManager.putAndRegister( outFile, os.path.realpath( file ), outputSE.name, catalog=outputFCName )
            # lcg_util binding prevent multithreading, use subprocess instead
            result = pythonCall(2 * 3600,
                                replicaManager.putAndRegister,
                                outFile,
                                os.path.realpath(file),
                                outputSE.name,
                                catalog=outputFCName)
            if result['OK'] and result['Value']['OK']:
                if outFile in result['Value']['Value']['Successful']:
                    transferOK = True
                    break
                else:
                    self.log.error(result['Value']['Value']['Failed'][outFile])
            else:
                if result['OK']:
                    self.log.error(result['Value']['Message'])
                else:
                    self.log.error(result['Message'])

        if not transferOK:
            return S_ERROR(fileName)

        # transferOK implies the last putAndRegister call succeeded, so the
        # local copy (the original file or the downloaded temporary) can go
        os.unlink(file)

        self.log.info("Finished transferring %s [%s bytes]" %
                      (inFile, inBytes))
        self.__okTransferredFiles += 1
        self.__okTransferredBytes += inBytes

        if inputFCName == 'LocalDisk':
            return S_OK(fileName)

        # Now the file is on final SE/FC, remove from input SE/FC
        for se in seList:
            se = StorageElement(se)
            self.log.info('Removing from %s:' % se.name, inFile)
            se.removeFile(inFile)

        inputFC.removeFile(inFile)

        return S_OK(fileName)

    @transferSync
    def transferCallback(self, threadedJob, submitResult):
        if not submitResult['OK']:
            file = submitResult['Message']
            if file not in self.__failedFiles:
                self.__failedFiles[file] = 0
            self.__failedFiles[file] += 1
        else:
            file = submitResult['Value']
            if file in self.__failedFiles:
                del self.__failedFiles[file]
        #Take out from processing files
        if file in self.__processingFiles:
            self.__processingFiles.discard(file)
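For context, the two public entry points above are meant to be driven once per agent cycle: checkForTransfers() fills the thread pool, processAllPendingTransfers() drains it, and transferCallback() does the bookkeeping in between. A minimal sketch of such a cycle, assuming the surrounding class is a DIRAC AgentModule and that execute() is its per-cycle hook (the wiring shown here is illustrative, not part of the example above):

    def execute(self):
        # one agent cycle: queue any new outgoing files, then block until
        # every queued transfer has gone through transferCallback
        result = self.checkForTransfers()
        if not result['OK']:
            self.log.error(result['Message'])
        self.processAllPendingTransfers()
        return S_OK()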
Code example #7
class FTSAgent(AgentModule):
    """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
    # # fts graph refresh in seconds
    FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
    # # SE R/W access refresh in seconds
    RW_REFRESH = 600
    # # placeholder for max job per channel
    MAX_ACTIVE_JOBS = 50
    # # min threads
    MIN_THREADS = 1
    # # max threads
    MAX_THREADS = 10
    # # files per job
    MAX_FILES_PER_JOB = 100
    # # MAX FTS transfer per FTSFile
    MAX_ATTEMPT = 256
    # # stage flag
    STAGE_FILES = False
    # # replica manager
    __replicaManager = None
    # # placeholder for FTS client
    __ftsClient = None
    # # placeholder for request client
    __requestClient = None
    # # placeholder for resources helper
    __resources = None
    # # placeholder for RSS client
    __rssClient = None
    # # placeholder for FTSGraph
    __ftsGraph = None
    # # graph regeneration time delta
    __ftsGraphValidStamp = None
    # # r/w access valid stamp
    __rwAccessValidStamp = None
    # # placeholder for threadPool
    __threadPool = None
    # # update lock
    __updateLock = None
    # # se cache
    __seCache = dict()
    # # request cache
    __reqCache = dict()

    def updateLock(self):
        """ update lock """
        if not self.__updateLock:
            self.__updateLock = LockRing().getLock("FTSAgentLock")
        return self.__updateLock

    @classmethod
    def requestClient(cls):
        """ request client getter """
        if not cls.__requestClient:
            cls.__requestClient = ReqClient()
        return cls.__requestClient

    @classmethod
    def ftsClient(cls):
        """ FTS client """
        if not cls.__ftsClient:
            cls.__ftsClient = FTSClient()
        return cls.__ftsClient

    @classmethod
    def replicaManager(cls):
        """ replica manager getter """
        if not cls.__replicaManager:
            cls.__replicaManager = ReplicaManager()
        return cls.__replicaManager

    @classmethod
    def rssClient(cls):
        """ RSS client getter """
        if not cls.__rssClient:
            cls.__rssClient = ResourceStatus()
        return cls.__rssClient

    @classmethod
    def getSE(cls, seName):
        """ keep SEs in cache """
        if seName not in cls.__seCache:
            cls.__seCache[seName] = StorageElement(seName)
        return cls.__seCache[seName]

    @classmethod
    def getRequest(cls, reqName):
        """ keep Requests in cache """
        if reqName not in cls.__reqCache:
            getRequest = cls.requestClient().getRequest(reqName)
            if not getRequest["OK"]:
                return getRequest
            getRequest = getRequest["Value"]
            if not getRequest:
                return S_ERROR("request of name '%s' not found in ReqDB" %
                               reqName)
            cls.__reqCache[reqName] = getRequest

        return S_OK(cls.__reqCache[reqName])

    @classmethod
    def putRequest(cls, request):
        """ put request back to ReqDB

    :param Request request: Request instance

    also finalize request if status == Done
    """
        # # put back request
        put = cls.requestClient().putRequest(request)
        if not put["OK"]:
            return put
        # # finalize first is possible
        if request.Status == "Done" and request.JobID:
            finalizeRequest = cls.requestClient().finalizeRequest(
                request.RequestName, request.JobID)
            if not finalizeRequest["OK"]:
                request.Status = "Scheduled"
        # # del request from cache
        if request.RequestName in cls.__reqCache:
            del cls.__reqCache[request.RequestName]
        return S_OK()

    @classmethod
    def putFTSJobs(cls, ftsJobsList):
        """ put back fts jobs to the FTSDB """
        for ftsJob in ftsJobsList:
            put = cls.ftsClient().putFTSJob(ftsJob)
            if not put["OK"]:
                return put
        return S_OK()

    @staticmethod
    def updateFTSFileDict(ftsFilesDict, toUpdateDict):
        """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
        for category, ftsFileList in ftsFilesDict.items():
            for ftsFile in toUpdateDict.get(category, []):
                if ftsFile not in ftsFileList:
                    ftsFileList.append(ftsFile)
        return ftsFilesDict


    #  def resources( self ):
    #    """ resource helper getter """
    #    if not self.__resources:
    #      self.__resources = Resources()
    #    return self.__resources

    def threadPool(self):
        """ thread pool getter """
        if not self.__threadPool:
            self.__threadPool = ThreadPool(self.MIN_THREADS, self.MAX_THREADS)
            self.__threadPool.daemonize()
        return self.__threadPool

    def resetFTSGraph(self):
        """ create fts graph """
        log = gLogger.getSubLogger("ftsGraph")

        ftsHistory = self.ftsClient().getFTSHistory()
        if not ftsHistory["OK"]:
            log.error("unable to get FTS history: %s" % ftsHistory["Message"])
            return ftsHistory
        ftsHistory = ftsHistory["Value"]

        try:
            self.updateLock().acquire()
            self.__ftsGraph = FTSGraph("FTSGraph", ftsHistory)
        finally:
            self.updateLock().release()

        log.debug("FTSSites: %s" % len(self.__ftsGraph.nodes()))
        for i, site in enumerate(self.__ftsGraph.nodes()):
            log.debug(" [%02d] FTSSite: %-25s FTSServer: %s" %
                      (i, site.name, site.FTSServer))
        log.debug("FTSRoutes: %s" % len(self.__ftsGraph.edges()))
        for i, route in enumerate(self.__ftsGraph.edges()):
            log.debug(
                " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" %
                (i, route.routeName, route.ActiveJobs,
                 route.toNode.MaxActiveJobs))
        # # save graph stamp
        self.__ftsGraphValidStamp = datetime.datetime.now() + \
            datetime.timedelta(seconds=self.FTSGRAPH_REFRESH)

        # # refresh SE R/W access
        try:
            self.updateLock().acquire()
            self.__ftsGraph.updateRWAccess()
        finally:
            self.updateLock().release()
        # # save rw access stamp
        self.__rwAccessValidStamp = datetime.datetime.now() + \
            datetime.timedelta(seconds=self.RW_REFRESH)

        return S_OK()

    def initialize(self):
        """ agent's initialization """

        log = self.log.getSubLogger("initialize")

        self.FTSGRAPH_REFRESH = self.am_getOption("FTSGraphValidityPeriod",
                                                  self.FTSGRAPH_REFRESH)
        log.info("FTSGraph validity period       = %s s" %
                 self.FTSGRAPH_REFRESH)
        self.RW_REFRESH = self.am_getOption("RWAccessValidityPeriod",
                                            self.RW_REFRESH)
        log.info("SEs R/W access validity period = %s s" % self.RW_REFRESH)

        self.STAGE_FILES = self.am_getOption("StageFiles", self.STAGE_FILES)
        log.info("Stage files before submission  = %s" % {
            True: "yes",
            False: "no"
        }[bool(self.STAGE_FILES)])

        self.MAX_ACTIVE_JOBS = self.am_getOption("MaxActiveJobsPerRoute",
                                                 self.MAX_ACTIVE_JOBS)
        log.info("Max active FTSJobs/route       = %s" % self.MAX_ACTIVE_JOBS)
        self.MAX_FILES_PER_JOB = self.am_getOption("MaxFilesPerJob",
                                                   self.MAX_FILES_PER_JOB)
        log.info("Max FTSFiles/FTSJob            = %d" %
                 self.MAX_FILES_PER_JOB)

        self.MAX_ATTEMPT = self.am_getOption("MaxTransferAttempts",
                                             self.MAX_ATTEMPT)
        log.info("Max transfer attempts          = %s" % self.MAX_ATTEMPT)

        # # thread pool
        self.MIN_THREADS = self.am_getOption("MinThreads", self.MIN_THREADS)
        self.MAX_THREADS = self.am_getOption("MaxThreads", self.MAX_THREADS)
        minmax = (abs(self.MIN_THREADS), abs(self.MAX_THREADS))
        self.MIN_THREADS, self.MAX_THREADS = min(minmax), max(minmax)
        log.info("ThreadPool min threads         = %s" % self.MIN_THREADS)
        log.info("ThreadPool max threads         = %s" % self.MAX_THREADS)

        log.info("initialize: creation of FTSGraph...")
        createGraph = self.resetFTSGraph()
        if not createGraph["OK"]:
            log.error("initialize: %s" % createGraph["Message"])
            return createGraph

        # This sets the default proxy to be used, as defined under
        # /Operations/Shifter/DataManager;
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        log.info("will use DataManager proxy")

        # # gMonitor stuff here
        gMonitor.registerActivity("RequestsAtt",
                                  "Attempted requests executions", "FTSAgent",
                                  "Requests/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RequestsOK",
                                  "Successful requests executions", "FTSAgent",
                                  "Requests/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RequestsFail", "Failed requests executions",
                                  "FTSAgent", "Requests/min", gMonitor.OP_SUM)

        gMonitor.registerActivity("FTSJobsSubAtt", "FTSJobs creation attempts",
                                  "FTSAgent", "Created FTSJobs/min",
                                  gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSJobsSubOK",
                                  "FTSJobs submitted successfully", "FTSAgent",
                                  "Successful FTSJobs submissions/min",
                                  gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSJobsSubFail",
                                  "FTSJobs submissions failed", "FTSAgent",
                                  "Failed FTSJobs submissions/min",
                                  gMonitor.OP_SUM)

        gMonitor.registerActivity("FTSJobsMonAtt", "FTSJobs monitored",
                                  "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSJobsMonOK",
                                  "FTSJobs monitored successfully", "FTSAgent",
                                  "FTSJobs/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSJobsMonFail", "FTSJobs attempts failed",
                                  "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM)

        gMonitor.registerActivity("FTSMonitorFail",
                                  "Failed FTS monitor executions", "FTSAgent",
                                  "Execution/mins", gMonitor.OP_SUM)

        pollingTime = self.am_getOption("PollingTime", 60)
        for status in list(FTSJob.INITSTATES + FTSJob.TRANSSTATES +
                           FTSJob.FAILEDSTATES + FTSJob.FINALSTATES):
            gMonitor.registerActivity("FTSJobs%s" % status,
                                      "FTSJobs %s" % status, "FTSAgent",
                                      "FTSJobs/cycle", gMonitor.OP_ACUM,
                                      pollingTime)

        gMonitor.registerActivity("FtSJobsPerRequest",
                                  "Average FTSJobs per request", "FTSAgent",
                                  "FTSJobs/Request", gMonitor.OP_MEAN)
        gMonitor.registerActivity("FTSFilesPerJob", "FTSFiles per FTSJob",
                                  "FTSAgent", "Number of FTSFiles per FTSJob",
                                  gMonitor.OP_MEAN)
        gMonitor.registerActivity("FTSSizePerJob",
                                  "Average FTSFiles size per FTSJob",
                                  "FTSAgent",
                                  "Average submitted size per FTSJob",
                                  gMonitor.OP_MEAN)
        return S_OK()

    def finalize(self):
        """ finalize processing """
        log = self.log.getSubLogger("finalize")
        for request in self.__reqCache.values():
            put = self.requestClient().putRequest(request)
            if not put["OK"]:
                log.error("unable to put back request '%s': %s" %
                          (request.RequestName, put["Message"]))
        return S_OK()

    def execute(self):
        """ one cycle execution """
        log = gLogger.getSubLogger("execute")
        # # reset FTSGraph if expired
        now = datetime.datetime.now()
        if now > self.__ftsGraphValidStamp:
            log.info("resetting expired FTS graph...")
            resetFTSGraph = self.resetFTSGraph()
            if not resetFTSGraph["OK"]:
                log.error("FTSGraph recreation error: %s" %
                          resetFTSGraph["Message"])
                return resetFTSGraph
            self.__ftsGraphValidStamp = now + datetime.timedelta(
                seconds=self.FTSGRAPH_REFRESH)
        # # update R/W access in FTSGraph if expired
        if now > self.__rwAccessValidStamp:
            log.info("updating expired R/W access for SEs...")
            try:
                self.updateLock().acquire()
                self.__ftsGraph.updateRWAccess()
            finally:
                self.updateLock().release()
                self.__rwAccessValidStamp = now + datetime.timedelta(
                    seconds=self.RW_REFRESH)

        requestNames = self.requestClient().getRequestNamesList(["Scheduled"])
        if not requestNames["OK"]:
            log.error("unable to read scheduled request names: %s" %
                      requestNames["Message"])
            return requestNames
        if not requestNames["Value"]:
            requestNames = self.__reqCache.keys()
        else:
            requestNames = [req[0] for req in requestNames["Value"]]
            requestNames = list(set(requestNames + self.__reqCache.keys()))

        if not requestNames:
            log.info("no 'Scheduled' requests to process")
            return S_OK()

        log.info("found %s requests to process:" % len(requestNames))
        log.info(" => from internal cache: %s" % (len(self.__reqCache)))
        log.info(" =>   new read from RMS: %s" %
                 (len(requestNames) - len(self.__reqCache)))

        for requestName in requestNames:
            request = self.getRequest(requestName)
            if not request["OK"]:
                log.error(request["Message"])
                continue
            request = request["Value"]
            sTJId = request.RequestName
            while True:
                queue = self.threadPool().generateJobAndQueueIt(
                    self.processRequest, args=(request, ), sTJId=sTJId)
                if queue["OK"]:
                    log.info("request '%s' enqueued for execution" % sTJId)
                    gMonitor.addMark("RequestsAtt", 1)
                    break
                time.sleep(1)

        # # process all results
        self.threadPool().processAllResults()
        return S_OK()

    def processRequest(self, request):
        """ process one request

    :param Request request: ReqDB.Request
    """
        log = self.log.getSubLogger(request.RequestName)

        operation = request.getWaiting()
        if not operation["OK"]:
            log.error(
                "unable to find 'Scheduled' ReplicateAndRegister operation in request"
            )
            return self.putRequest(request)
        operation = operation["Value"]
        if operation.Type != "ReplicateAndRegister":
            log.error(
                "operation to be executed is not a ReplicateAndRegister but %s"
                % operation.Type)
            return self.putRequest(request)
        if operation.Status != "Scheduled":
            log.error(
                "operation in a wrong state, expecting 'Scheduled', got %s" %
                operation.Status)
            return self.putRequest(request)

        # # select  FTSJobs, by default all in TRANS_STATES and INIT_STATES
        ftsJobs = self.ftsClient().getFTSJobsForRequest(request.RequestID)
        if not ftsJobs["OK"]:
            log.error(ftsJobs["Message"])
            return ftsJobs
        ftsJobs = ftsJobs["Value"] if ftsJobs["Value"] else []

        # # dict keeping info about files to reschedule, submit, fail and register
        ftsFilesDict = dict([(k, list())
                             for k in ("toRegister", "toSubmit", "toFail",
                                       "toReschedule", "toUpdate")])

        if ftsJobs:
            log.info("==> found %s FTSJobs to monitor" % len(ftsJobs))
            # # PHASE 0 = monitor active FTSJobs
            for ftsJob in ftsJobs:
                monitor = self.__monitorJob(request, ftsJob)
                if not monitor["OK"]:
                    log.error("unable to monitor FTSJob %s: %s" %
                              (ftsJob.FTSJobID, monitor["Message"]))
                    ftsJob.Status = "Submitted"
                    continue
                ftsFilesDict = self.updateFTSFileDict(ftsFilesDict,
                                                      monitor["Value"])

            log.info("monitoring of FTSJobs completed")
            for key, ftsFiles in ftsFilesDict.items():
                if ftsFiles:
                    log.debug(" => %s FTSFiles to %s" %
                              (len(ftsFiles), key[2:].lower()))

        # # PHASE ONE - check ready replicas
        missingReplicas = self.__checkReadyReplicas(request, operation)
        if not missingReplicas["OK"]:
            log.error(missingReplicas["Message"])
        else:
            missingReplicas = missingReplicas["Value"]
            for opFile in operation:
                # Actually the condition below should never happen... Change printout for checking
                if opFile.LFN not in missingReplicas and opFile.Status != 'Done':
                    log.warn("Should be set! %s is replicated at all targets" %
                             opFile.LFN)
                    opFile.Status = "Done"

        toFail = ftsFilesDict.get("toFail", [])
        toReschedule = ftsFilesDict.get("toReschedule", [])
        toSubmit = ftsFilesDict.get("toSubmit", [])
        toRegister = ftsFilesDict.get("toRegister", [])
        toUpdate = ftsFilesDict.get("toUpdate", [])

        # # PHASE TWO = Failed files? -> make request Failed and return
        if toFail:
            log.error(
                "==> found %s 'Failed' FTSFiles, request execution cannot proceed..."
                % len(toFail))
            for opFile in operation:
                for ftsFile in toFail:
                    if opFile.FileID == ftsFile.FileID:
                        opFile.Error = ftsFile.Error
                        opFile.Status = "Failed"
            operation.Error = "%s files are missing any replicas" % len(toFail)
            # # request.Status should be "Failed" at this stage
            if request.Status == "Failed":
                request.Error = "ReplicateAndRegister %s failed" % operation.Order
                log.error("request is set to 'Failed'")
                return self.putRequest(request)

        # # PHASE THREE - update Waiting#SourceSE FTSFiles
        if toUpdate:
            log.info("==> found %s possible FTSFiles to update..." %
                     (len(toUpdate)))
            byTarget = {}
            for ftsFile in toUpdate:
                byTarget.setdefault(ftsFile.TargetSE, []).append(ftsFile.FileID)
            for targetSE, fileIDList in byTarget.items():
                update = self.ftsClient().setFTSFilesWaiting(
                    operation.OperationID, targetSE, fileIDList)
                if not update["OK"]:
                    log.error("update FTSFiles failed: %s" % update["Message"])
                    continue

        # # PHASE FOUR - add 'RegisterReplica' Operations
        if toRegister:
            log.info(
                "==> found %s Files waiting for registration, adding 'RegisterReplica' operations"
                % len(toRegister))
            registerFiles = self.__register(request, operation, toRegister)
            if not registerFiles["OK"]:
                log.error("unable to create 'RegisterReplica' operations: %s" %
                          registerFiles["Message"])
            if request.Status == "Waiting":
                log.info(
                    "request is in 'Waiting' state, will put it back to RMS")
                return self.putRequest(request)

        # # PHASE FIVE - reschedule operation files
        if toReschedule:
            log.info("==> found %s Files to reschedule" % len(toReschedule))
            rescheduleFiles = self.__reschedule(request, operation,
                                                toReschedule)
            if not rescheduleFiles["OK"]:
                log.error(rescheduleFiles["Message"])
            if request.Status == "Waiting":
                log.info(
                    "request is in 'Waiting' state, will put it back to ReqDB")
                return self.putRequest(request)

        # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs
        ftsFiles = self.ftsClient().getFTSFilesForRequest(
            request.RequestID, ["Waiting"])
        if not ftsFiles["OK"]:
            log.error(ftsFiles["Message"])
        else:
            retryIds = list(set([ftsFile.FTSFileID for ftsFile in toSubmit]))
            for ftsFile in ftsFiles["Value"]:
                if ftsFile.FTSFileID not in retryIds:
                    toSubmit.append(ftsFile)
                    retryIds.append(ftsFile.FTSFileID)

        # # submit new ftsJobs
        if operation.Status == "Scheduled" and toSubmit:
            log.info("==> found %s FTSFiles to submit" % len(toSubmit))
            submit = self.__submit(request, operation, toSubmit)
            if not submit["OK"]:
                log.error(submit["Message"])
            else:
                ftsJobs += submit["Value"]

        # # status change? - put back request
        if request.Status != "Scheduled":
            put = self.putRequest(request)
            if not put["OK"]:
                log.error("unable to put back request: %s" % put["Message"])
                return put

        # #  put back jobs
        if ftsJobs:
            putJobs = self.putFTSJobs(ftsJobs)
            if not putJobs["OK"]:
                log.error("unable to put back FTSJobs: %s" %
                          putJobs["Message"])
                return putJobs

        return S_OK()

    def __reschedule(self, request, operation, toReschedule):
        """ reschedule list of :toReschedule: files in request for operation :operation:

    :param Request request:
    :param Operation operation:
    :param list toReschedule: list of FTSFiles
    """
        log = self.log.getSubLogger("%s/reschedule" % request.RequestName)
        log.info("found %s files to reschedule" % len(toReschedule))

        for opFile in operation:
            for ftsFile in toReschedule:
                if opFile.FileID == ftsFile.FileID:
                    opFile.Status = "Waiting"

        toSchedule = []

        # # filter files
        for opFile in operation.getWaitingFilesList():

            replicas = self.__filterReplicas(opFile)
            if not replicas["OK"]:
                continue
            replicas = replicas["Value"]

            validReplicas = replicas["Valid"]
            bannedReplicas = replicas["Banned"]

            if not validReplicas and bannedReplicas:
                log.warn(
                    "unable to schedule '%s', replicas only at banned SEs" %
                    opFile.LFN)
                continue

            if validReplicas:
                validTargets = list(
                    set(operation.targetSEList) - set(validReplicas))
                if not validTargets:
                    log.info("file %s is already present at all targets" %
                             opFile.LFN)
                    opFile.Status = "Done"
                    continue
                toSchedule.append(
                    (opFile.toJSON()["Value"], validReplicas, validTargets))

        # # do real schedule here
        if toSchedule:

            ftsSchedule = self.ftsClient().ftsSchedule(request.RequestID,
                                                       operation.OperationID,
                                                       toSchedule)
            if not ftsSchedule["OK"]:
                self.log.error(ftsSchedule["Message"])
                return ftsSchedule

            ftsSchedule = ftsSchedule["Value"]
            for fileID in ftsSchedule["Successful"]:
                for opFile in operation:
                    if fileID == opFile.FileID:
                        opFile.Status = "Scheduled"

            for fileID, reason in ftsSchedule["Failed"]:
                for opFile in operation:
                    if fileID == opFile.FileID:
                        opFile.Error = reason

        return S_OK()

    def __submit(self, request, operation, toSubmit):
        """ create and submit new FTSJobs using list of FTSFiles

    :param Request request: ReqDB.Request instance
    :param list ftsFiles: list of FTSFile instances

    :return: [ FTSJob, FTSJob, ...]
    """
        log = self.log.getSubLogger("%s/submit" % request.RequestName)

        bySourceAndTarget = {}
        for ftsFile in toSubmit:
            bySourceAndTarget.setdefault(ftsFile.SourceSE, {}).setdefault(
                ftsFile.TargetSE, []).append(ftsFile)

        ftsJobs = []

        for source, targetDict in bySourceAndTarget.items():

            for target, ftsFileList in targetDict.items():

                log.info("found %s files to submit from %s to %s" %
                         (len(ftsFileList), source, target))

                route = self.__ftsGraph.findRoute(source, target)
                if not route["OK"]:
                    log.error(route["Message"])
                    continue
                route = route["Value"]

                sourceRead = route.fromNode.SEs[source]["read"]
                if not sourceRead:
                    log.error("SourceSE %s is banned for reading right now" %
                              source)
                    continue

                targetWrite = route.toNode.SEs[target]["write"]
                if not targetWrite:
                    log.error("TargetSE %s is banned for writing right now" %
                              target)
                    continue

                if route.ActiveJobs > route.toNode.MaxActiveJobs:
                    log.warn(
                        "unable to submit new FTS job, max active jobs reached"
                    )
                    continue

                # # create FTSJob
                ftsJob = FTSJob()
                ftsJob.RequestID = request.RequestID
                ftsJob.OperationID = operation.OperationID
                ftsJob.SourceSE = source
                ftsJob.TargetSE = target

                sourceSE = self.getSE(source)
                sourceToken = sourceSE.getStorageParameters("SRM2")
                if not sourceToken["OK"]:
                    log.error("unable to get sourceSE '%s' parameters: %s" %
                              (source, sourceToken["Message"]))
                    continue
                ftsJob.SourceToken = sourceToken["Value"].get("SpaceToken", "")

                targetSE = self.getSE(target)
                targetToken = targetSE.getStorageParameters("SRM2")
                if not targetToken["OK"]:
                    log.error("unable to get targetSE '%s' parameters: %s" %
                              (target, targetToken["Message"]))
                    continue
                ftsJob.TargetToken = targetToken["Value"].get("SpaceToken", "")

                ftsJob.FTSServer = route.toNode.FTSServer

                for ftsFile in ftsFileList:
                    ftsFile.Attempt += 1
                    ftsFile.Error = ""
                    ftsJob.addFile(ftsFile)

                submit = ftsJob.submitFTS2(self.STAGE_FILES)
                if not submit["OK"]:
                    log.error("unable to submit FTSJob: %s" %
                              submit["Message"])
                    continue

                log.info("FTSJob '%s'@'%s' has been submitted" %
                         (ftsJob.FTSGUID, ftsJob.FTSServer))

                # # update statuses for job files (Attempt was already bumped above)
                for ftsFile in ftsJob:
                    ftsFile.FTSGUID = ftsJob.FTSGUID
                    ftsFile.Status = "Submitted"

                # # update graph route
                try:
                    self.updateLock().acquire()
                    route.ActiveJobs += 1
                finally:
                    self.updateLock().release()

                ftsJobs.append(ftsJob)

        log.info("%s new FTSJobs have been submitted" % len(ftsJobs))
        return S_OK(ftsJobs)

    def __monitorJob(self, request, ftsJob):
        """ execute FTSJob.monitorFTS2 for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
        log = self.log.getSubLogger("%s/monitor/%s" %
                                    (request.RequestName, ftsJob.FTSGUID))
        log.info("FTSJob '%s'@'%s'" % (ftsJob.FTSGUID, ftsJob.FTSServer))

        # # this will be returned
        ftsFilesDict = dict([(k, list())
                             for k in ("toRegister", "toSubmit", "toFail",
                                       "toReschedule", "toUpdate")])

        monitor = ftsJob.monitorFTS2()
        if not monitor["OK"]:
            gMonitor.addMark("FTSMonitorFail", 1)
            log.error(monitor["Message"])
            if "getTransferJobSummary2: Not authorised to query request" in monitor[
                    "Message"]:
                log.error("FTSJob not known (expired on server?)")
                for ftsFile in ftsJob:
                    ftsFile.Status = "Waiting"
                    ftsFilesDict["toSubmit"].append(ftsFile)
                return S_OK(ftsFilesDict)
            return monitor

        monitor = monitor["Value"]
        log.info("FTSJob Status = %s Completeness = %s" %
                 (ftsJob.Status, ftsJob.Completeness))

        # # monitor status change
        gMonitor.addMark("FTSJobs%s" % ftsJob.Status, 1)

        if ftsJob.Status in FTSJob.FINALSTATES:
            finalizeFTSJob = self.__finalizeFTSJob(request, ftsJob)
            if not finalizeFTSJob["OK"]:
                log.error(finalizeFTSJob["Message"])
                return finalizeFTSJob
            ftsFilesDict = self.updateFTSFileDict(ftsFilesDict,
                                                  finalizeFTSJob["Value"])

        return S_OK(ftsFilesDict)

    def __finalizeFTSJob(self, request, ftsJob):
        """ finalize FTSJob

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
        log = self.log.getSubLogger("%s/monitor/%s/finalize" %
                                    (request.RequestName, ftsJob.FTSJobID))
        log.info("finalizing FTSJob %s@%s" %
                 (ftsJob.FTSGUID, ftsJob.FTSServer))

        # # this will be returned
        ftsFilesDict = dict([(k, list())
                             for k in ("toRegister", "toSubmit", "toFail",
                                       "toReschedule", "toUpdate")])

        monitor = ftsJob.monitorFTS2(full=True)
        if not monitor["OK"]:
            log.error(monitor["Message"])
            return monitor

        # # split FTSFiles to different categories
        processFiles = self.__filterFiles(ftsJob)
        if not processFiles["OK"]:
            log.error(processFiles["Message"])
            return processFiles
        ftsFilesDict = self.updateFTSFileDict(ftsFilesDict,
                                              processFiles["Value"])

        # # send accounting record for this job
        self.__sendAccounting(ftsJob, request.OwnerDN)

        # # update graph - remove this job from graph
        route = self.__ftsGraph.findRoute(ftsJob.SourceSE, ftsJob.TargetSE)
        if route["OK"]:
            try:
                self.updateLock().acquire()
                route["Value"].ActiveJobs -= 1
            finally:
                self.updateLock().release()

        log.info("FTSJob is finalized")

        return S_OK(ftsFilesDict)

    def __filterFiles(self, ftsJob):
        """ process ftsFiles from finished ftsJob

    :param FTSJob ftsJob: monitored FTSJob instance
    """
        # # lists for different categories
        toUpdate = []
        toReschedule = []
        toRegister = []
        toSubmit = []
        toFail = []

        # # loop over files in fts job
        for ftsFile in ftsJob:
            # # successful files
            if ftsFile.Status == "Finished":
                if ftsFile.Error == "AddCatalogReplicaFailed":
                    toRegister.append(ftsFile)
                toUpdate.append(ftsFile)
                continue
            if ftsFile.Status == "Failed":
                if ftsFile.Error == "MissingSource":
                    toReschedule.append(ftsFile)
                else:
                    if ftsFile.Attempt < self.MAX_ATTEMPT:
                        toSubmit.append(ftsFile)
                    else:
                        toFail.append(ftsFile)
                        ftsFile.Error = "Max attempts reached"

        return S_OK({
            "toUpdate": toUpdate,
            "toSubmit": toSubmit,
            "toRegister": toRegister,
            "toReschedule": toReschedule,
            "toFail": toFail
        })

    def __register(self, request, operation, toRegister):
        """ add RegisterReplica operation

    :param Request request: request instance
    :param Operation transferOp: 'ReplicateAndRegister' operation for this FTSJob
    :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
    """
        log = self.log.getSubLogger("%s/registerFiles" % request.RequestName)

        byTarget = {}
        for ftsFile in toRegister:
            byTarget.setdefault(ftsFile.TargetSE, []).append(ftsFile)
        log.info("will create %s 'RegisterReplica' operations" % len(byTarget))

        for target, ftsFileList in byTarget.items():
            log.info(
                "creating 'RegisterReplica' operation for targetSE %s with %s files..."
                % (target, len(ftsFileList)))
            registerOperation = Operation()
            registerOperation.Type = "RegisterReplica"
            registerOperation.Status = "Waiting"
            registerOperation.TargetSE = target
            targetSE = self.getSE(target)
            for ftsFile in ftsFileList:
                opFile = File()
                opFile.LFN = ftsFile.LFN
                pfn = targetSE.getPfnForProtocol(ftsFile.TargetSURL,
                                                 "SRM2",
                                                 withPort=False)
                if not pfn["OK"]:
                    continue
                opFile.PFN = pfn["Value"]
                registerOperation.addFile(opFile)
            request.insertBefore(registerOperation, operation)

        return S_OK()

    @staticmethod
    def __sendAccounting(ftsJob, ownerDN):
        """ prepare and send DataOperation to AccouringDB """

        dataOp = DataOperation()
        dataOp.setStartTime(fromString(ftsJob.SubmitTime))
        dataOp.setEndTime(fromString(ftsJob.LastUpdate))

        accountingDict = dict()
        accountingDict["OperationType"] = "ReplicateAndRegister"

        username = getUsernameForDN(ownerDN)
        if not username["OK"]:
            username = ownerDN
        else:
            username = username["Value"]

        accountingDict["User"] = username
        accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
        accountingDict['ExecutionSite'] = ftsJob.FTSServer

        accountingDict['RegistrationTime'] = ftsJob._regTime
        accountingDict['RegistrationOK'] = ftsJob._regSuccess
        accountingDict['RegistrationTotal'] = ftsJob._regTotal

        accountingDict["TransferOK"] = len(
            [f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES])
        accountingDict["TransferTotal"] = len(ftsJob)
        accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
        accountingDict["FinalStatus"] = ftsJob.Status
        accountingDict["Source"] = ftsJob.SourceSE
        accountingDict["Destination"] = ftsJob.TargetSE

        dt = ftsJob.LastUpdate - ftsJob.SubmitTime
        transferTime = dt.days * 86400 + dt.seconds
        accountingDict["TransferTime"] = transferTime
        # accountingDict['TransferTime'] = sum( [f._duration for f in ftsJob])
        dataOp.setValuesFromDict(accountingDict)
        dataOp.commit()

    def __checkReadyReplicas(self, request, operation):
        """ check ready replicas for transferOperation """
        log = self.log.getSubLogger("%s/checkReadyReplicas" %
                                    request.RequestName)

        targetSESet = set(operation.targetSEList)

        # # { LFN: [ targetSE, ... ] }
        missingReplicas = {}

        scheduledFiles = dict([(opFile.LFN, opFile) for opFile in operation
                               if opFile.Status in ("Scheduled", "Waiting")])
        # # get replicas
        replicas = self.replicaManager().getCatalogReplicas(
            scheduledFiles.keys())

        if not replicas["OK"]:
            self.log.error(replicas["Message"])
            return replicas
        replicas = replicas["Value"]

        fullyReplicated = 0
        missingSEs = {}
        for successfulLFN in replicas["Successful"]:
            reps = set(replicas['Successful'][successfulLFN])
            if targetSESet.issubset(reps):
                log.info("%s has been replicated to all targets" %
                         successfulLFN)
                fullyReplicated += 1
                scheduledFiles[successfulLFN].Status = "Done"
            else:
                missingReplicas[successfulLFN] = sorted(targetSESet - reps)
                ses = ",".join(missingReplicas[successfulLFN])
                missingSEs[ses] = missingSEs.setdefault(ses, 0) + 1
                log.verbose("%s is still missing at %s" % (successfulLFN, ses))
        if fullyReplicated:
            log.info("%d new files have been replicated to all targets" %
                     fullyReplicated)
        if missingSEs:
            for ses in missingSEs:
                log.info("%d replicas still missing at %s" %
                         (missingSEs[ses], ses))

        reMissing = re.compile("no such file or directory")
        for failedLFN, errStr in replicas["Failed"].items():
            scheduledFiles[failedLFN].Error = errStr
            if reMissing.search(errStr.lower()):
                log.error("%s is missing, setting its status to 'Failed'" %
                          failedLFN)
                scheduledFiles[failedLFN].Status = "Failed"
            else:
                log.warn("unable to read replicas for %s: %s" %
                         (failedLFN, errStr))

        return S_OK(missingReplicas)

    def __filterReplicas(self, opFile):
        """ filter out banned/invalid source SEs """
        log = self.log.getSubLogger("filterReplicas")

        ret = {"Valid": [], "Banned": [], "Bad": []}

        replicas = self.replicaManager().getActiveReplicas(opFile.LFN)
        if not replicas["OK"]:
            log.error(replicas["Message"])
            return replicas
        reNotExists = re.compile("no such file or directory")
        replicas = replicas["Value"]
        failed = replicas["Failed"].get(opFile.LFN, "")
        if reNotExists.search(failed.lower()):
            opFile.Status = "Failed"
            opFile.Error = failed
            return S_ERROR(failed)

        replicas = replicas["Successful"][
            opFile.LFN] if opFile.LFN in replicas["Successful"] else {}

        for repSEName in replicas:

            repSE = self.getSE(repSEName)

            pfn = repSE.getPfnForLfn(opFile.LFN)
            if not pfn["OK"]:
                log.warn("unable to create pfn for %s lfn: %s" %
                         (opFile.LFN, pfn["Message"]))
                ret["Banned"].append(repSEName)
                continue
            pfn = pfn["Value"]

            repSEMetadata = repSE.getFileMetadata(pfn, singleFile=True)
            if not repSEMetadata["OK"]:
                self.log.warn(repSEMetadata["Message"])
                ret["Banned"].append(repSEName)
                continue
            repSEMetadata = repSEMetadata["Value"]

            seChecksum = repSEMetadata["Checksum"].replace(
                "x", "0").zfill(8) if "Checksum" in repSEMetadata else None
            if opFile.Checksum and opFile.Checksum != seChecksum:
                self.log.warn(" %s checksum mismatch: %s %s:%s" %
                              (opFile.LFN, opFile.Checksum, repSE, seChecksum))
                ret["Bad"].append(repSEName)
                continue
            # # if we're here repSE is OK
            ret["Valid"].append(repSEName)

        return S_OK(ret)
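A quick standalone illustration of the merge performed by updateFTSFileDict above: the category dictionaries returned by successive __monitorJob calls are folded into one running dictionary without duplicating entries. Plain strings stand in for FTSFile objects here (hypothetical values, runnable without DIRAC):

categories = ("toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate")
ftsFilesDict = dict((k, list()) for k in categories)

# pretend these came back from two monitoring calls
firstMonitor = {"toSubmit": ["lfnA"], "toUpdate": ["lfnB"]}
secondMonitor = {"toSubmit": ["lfnA", "lfnC"]}  # "lfnA" reported twice

for toUpdateDict in (firstMonitor, secondMonitor):
    for category, ftsFileList in ftsFilesDict.items():
        for ftsFile in toUpdateDict.get(category, []):
            if ftsFile not in ftsFileList:
                ftsFileList.append(ftsFile)

print(ftsFilesDict["toSubmit"])  # ['lfnA', 'lfnC'] - duplicate folded away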
Code example #8
File: Service.py Project: DIRACGrid-test/DIRAC
class Service( object ):

  SVC_VALID_ACTIONS = { 'RPC' : 'export',
                        'FileTransfer': 'transfer',
                        'Message' : 'msg',
                        'Connection' : 'Message' }
  SVC_SECLOG_CLIENT = SecurityLogClient()

  def __init__( self, serviceData ):
    self._svcData = serviceData
    self._name = serviceData[ 'modName' ]
    self._startTime = Time.dateTime()
    self._validNames = [ serviceData[ 'modName' ]  ]
    if serviceData[ 'loadName' ] not in self._validNames:
      self._validNames.append( serviceData[ 'loadName' ] )
    self._cfg = ServiceConfiguration( list( self._validNames ) )
    if serviceData[ 'standalone' ]:
      self._monitor = gMonitor
    else:
      self._monitor = MonitoringClient()
    self.__monitorLastStatsUpdate = time.time()
    self._stats = { 'queries' : 0, 'connections' : 0 }
    self._authMgr = AuthManager( "%s/Authorization" % PathFinder.getServiceSection( serviceData[ 'loadName' ] ) )
    self._transportPool = getGlobalTransportPool()
    self.__cloneId = 0
    self.__maxFD = 0

  def setCloneProcessId( self, cloneId ):
    self.__cloneId = cloneId
    self._monitor.setComponentName( "%s-Clone:%s" % ( self._name, cloneId ) )

  def _isMetaAction( self, action ):
    referedAction = Service.SVC_VALID_ACTIONS[ action ]
    if referedAction in Service.SVC_VALID_ACTIONS:
      return referedAction
    return False

  def initialize( self ):
    #Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR( "Could not build service URL for %s" % self._name )
    gLogger.verbose( "Service URL is %s" % self._url )
    #Load handler
    result = self._loadHandlerInit()
    if not result[ 'OK' ]:
      return result
    self._handler = result[ 'Value' ]
    #Initialize lock manager
    self._lockManager = LockManager( self._cfg.getMaxWaitingPetitions() )
    self._initMonitoring()
    self._threadPool = ThreadPool( max( 1, self._cfg.getMinThreads() ),
                                   max( 0, self._cfg.getMaxThreads() ),
                                   self._cfg.getMaxWaitingPetitions() )
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker( "%sMSB" % self._name, threadPool = self._threadPool )
    #Create static dict
    self._serviceInfoDict = { 'serviceName' : self._name,
                              'serviceSectionPath' : PathFinder.getServiceSection( self._name ),
                              'URL' : self._cfg.getURL(),
                              'messageSender' : MessageSender( self._name, self._msgBroker ),
                              'validNames' : self._validNames,
                              'csPaths' : [ PathFinder.getServiceSection( svcName ) for svcName in self._validNames ]
                            }
    #Call static initialization function
    try:
      self._handler[ 'class' ]._rh__initializeClass( dict( self._serviceInfoDict ),
                                                     self._lockManager,
                                                     self._msgBroker,
                                                     self._monitor )
      if self._handler[ 'init' ]:
        for initFunc in self._handler[ 'init' ]:
          gLogger.verbose( "Executing initialization function" )
          try:
            result = initFunc( dict( self._serviceInfoDict ) )
          except Exception as excp:
            gLogger.exception( "Exception while calling initialization function", lException = excp )
            return S_ERROR( "Exception while calling initialization function: %s" % str( excp ) )
          if not isReturnStructure( result ):
            return S_ERROR( "Service initialization function %s must return S_OK/S_ERROR" % initFunc )
          if not result[ 'OK' ]:
            return S_ERROR( "Error while initializing %s: %s" % ( self._name, result[ 'Message' ] ) )
    except Exception as e:
      errMsg = "Exception while initializing %s" % self._name
      gLogger.exception( e )
      gLogger.exception( errMsg )
      return S_ERROR( errMsg )

    #Load actions after the handler has initialized itself
    result = self._loadActions()
    if not result[ 'OK' ]:
      return result
    self._actions = result[ 'Value' ]

    gThreadScheduler.addPeriodicTask( 30, self.__reportThreadPoolContents )

    return S_OK()

  def __searchInitFunctions( self, handlerClass, currentClass = None ):
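    # Depth-first walk over the ancestor classes: whenever an ancestor
    # defines initializeHandler, the implementation above currentClass in the
    # MRO is collected via super(), and the handler's own initializeHandler
    # is appended last, so base-class initializers run before the handler's.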
    if not currentClass:
      currentClass = handlerClass
    initFuncs = []
    ancestorHasInit = False
    for ancestor in currentClass.__bases__:
      initFuncs += self.__searchInitFunctions( handlerClass, ancestor )
      if 'initializeHandler' in dir( ancestor ):
        ancestorHasInit = True
    if ancestorHasInit:
      initFuncs.append( super( currentClass, handlerClass ).initializeHandler )
    if currentClass == handlerClass and 'initializeHandler' in dir( handlerClass ):
      initFuncs.append( handlerClass.initializeHandler )
    return initFuncs

  def _loadHandlerInit( self ):
    handlerClass = self._svcData[ 'classObj' ]
    handlerName = handlerClass.__name__
    handlerInitMethods = self.__searchInitFunctions( handlerClass )
    try:
      handlerInitMethods.append( getattr( self._svcData[ 'moduleObj' ], "initialize%s" % handlerName ) )
    except AttributeError:
      gLogger.verbose( "Not found global initialization function for service" )

    if handlerInitMethods:
      gLogger.info( "Found %s initialization methods" % len( handlerInitMethods ) )

    handlerInfo = {}
    handlerInfo[ "name" ] = handlerName
    handlerInfo[ "module" ] = self._svcData[ 'moduleObj' ]
    handlerInfo[ "class" ] = handlerClass
    handlerInfo[ "init" ] = handlerInitMethods

    return S_OK( handlerInfo )

  def _loadActions( self ):

    handlerClass = self._handler[ 'class' ]

    authRules = {}
    typeCheck = {}
    methodsList = {}
    for actionType in Service.SVC_VALID_ACTIONS:
      if self._isMetaAction( actionType ):
        continue
      authRules[ actionType ] = {}
      typeCheck[ actionType ] = {}
      methodsList[ actionType ] = []
    handlerAttributeList = dir( handlerClass )
    for actionType in Service.SVC_VALID_ACTIONS:
      if self._isMetaAction( actionType ):
        continue
      methodPrefix = '%s_' % Service.SVC_VALID_ACTIONS[ actionType ]
      for attribute in handlerAttributeList:
        if attribute.find( methodPrefix ) != 0:
          continue
        exportedName = attribute[ len( methodPrefix ) : ]
        methodsList[ actionType ].append( exportedName )
        gLogger.verbose( "+ Found %s method %s" % ( actionType, exportedName ) )
        #Create lock for method
        self._lockManager.createLock( "%s/%s" % ( actionType, exportedName ),
                                       self._cfg.getMaxThreadsForMethod( actionType, exportedName ) )
        #Look for type and auth rules
        if actionType == 'RPC':
          typeAttr = "types_%s" % exportedName
          authAttr = "auth_%s" % exportedName
        else:
          typeAttr = "types_%s_%s" % ( Service.SVC_VALID_ACTIONS[ actionType ], exportedName )
          authAttr = "auth_%s_%s" % ( Service.SVC_VALID_ACTIONS[ actionType ], exportedName )
        if typeAttr in handlerAttributeList:
          obj = getattr( handlerClass, typeAttr )
          gLogger.verbose( "|- Found type definition %s: %s" % ( typeAttr, str( obj ) ) )
          typeCheck[ actionType ][ exportedName ] = obj
        if authAttr in handlerAttributeList:
          obj = getattr( handlerClass, authAttr )
          gLogger.verbose( "|- Found auth rules %s: %s" % ( authAttr, str( obj ) ) )
          authRules[ actionType ][ exportedName ] = obj

    for actionType in Service.SVC_VALID_ACTIONS:
      referedAction = self._isMetaAction( actionType )
      if not referedAction:
        continue
      gLogger.verbose( "Action %s is a meta action for %s" % ( actionType, referedAction ) )
      authRules[ actionType ] = []
      for method in authRules[ referedAction ]:
        for prop in authRules[ referedAction ][ method ]:
          if prop not in authRules[ actionType ]:
            authRules[ actionType ].append( prop )
      gLogger.verbose( "Meta action %s props are %s" % ( actionType, authRules[ actionType ] ) )

    return S_OK( { 'methods' : methodsList, 'auth' : authRules, 'types' : typeCheck } )
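
  # For illustration, a handler that _loadActions() would pick up could look
  # like this (hypothetical; for the 'RPC' action type the method prefix is
  # 'export_', with matching 'types_' and 'auth_' class attributes):
  #
  #   class PingHandler( RequestHandler ):
  #     types_ping = []                # the method takes no arguments
  #     auth_ping = [ 'all' ]          # anybody may call it
  #     def export_ping( self ):
  #       return S_OK( 'pong' )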

  def _initMonitoring( self ):
    #Init extra bits of monitoring
    self._monitor.setComponentType( MonitoringClient.COMPONENT_SERVICE )
    self._monitor.setComponentName( self._name )
    self._monitor.setComponentLocation( self._cfg.getURL() )
    self._monitor.initialize()
    self._monitor.registerActivity( "Connections", "Connections received", "Framework", "connections", MonitoringClient.OP_RATE )
    self._monitor.registerActivity( "Queries", "Queries served", "Framework", "queries", MonitoringClient.OP_RATE )
    self._monitor.registerActivity( 'CPU', "CPU Usage", 'Framework', "CPU,%", MonitoringClient.OP_MEAN, 600 )
    self._monitor.registerActivity( 'MEM', "Memory Usage", 'Framework', 'Memory,MB', MonitoringClient.OP_MEAN, 600 )
    self._monitor.registerActivity( 'PendingQueries', "Pending queries", 'Framework', 'queries', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'ActiveQueries', "Active queries", 'Framework', 'threads', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'RunningThreads', "Running threads", 'Framework', 'threads', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'MaxFD', "Max File Descriptors", 'Framework', 'fd', MonitoringClient.OP_MEAN )

    self._monitor.setComponentExtraParam( 'DIRACVersion', DIRAC.version )
    self._monitor.setComponentExtraParam( 'platform', DIRAC.getPlatform() )
    self._monitor.setComponentExtraParam( 'startTime', Time.dateTime() )
    for prop in ( ( "__RCSID__", "version" ), ( "__doc__", "description" ) ):
      try:
        value = getattr( self._handler[ 'module' ], prop[0] )
      except Exception as e:
        gLogger.exception( e )
        gLogger.error( "Missing property", prop[0] )
        value = 'unset'
      self._monitor.setComponentExtraParam( prop[1], value )
    for secondaryName in self._cfg.registerAlsoAs():
      gLogger.info( "Registering %s also as %s" % ( self._name, secondaryName ) )
      self._validNames.append( secondaryName )
    return S_OK()

  def __reportThreadPoolContents( self ):
    self._monitor.addMark( 'PendingQueries', self._threadPool.pendingJobs() )
    self._monitor.addMark( 'ActiveQueries', self._threadPool.numWorkingThreads() )
    self._monitor.addMark( 'RunningThreads', threading.activeCount() )
    self._monitor.addMark( 'MaxFD', self.__maxFD )
    self.__maxFD = 0


  def getConfig( self ):
    return self._cfg

  #End of initialization functions

  def handleConnection( self, clientTransport ):
    self._stats[ 'connections' ] += 1
    self._monitor.setComponentExtraParam( 'queries', self._stats[ 'connections' ] )
    self._threadPool.generateJobAndQueueIt( self._processInThread,
                                             args = ( clientTransport, ) )

  #Threaded process function
  def _processInThread( self, clientTransport ):
    self.__maxFD = max( self.__maxFD, clientTransport.oSocket.fileno() )
    self._lockManager.lockGlobal()
    try:
      monReport = self.__startReportToMonitoring()
    except Exception:
      monReport = False
    try:
      #Handshake
      try:
        result = clientTransport.handshake()
        if not result[ 'OK' ]:
          clientTransport.close()
          return
      except Exception:
        return
      #Add to the transport pool
      trid = self._transportPool.add( clientTransport )
      if not trid:
        return
      #Receive and check proposal
      result = self._receiveAndCheckProposal( trid )
      if not result[ 'OK' ]:
        self._transportPool.sendAndClose( trid, result )
        return
      proposalTuple = result[ 'Value' ]
      #Instantiate handler
      result = self._instantiateHandler( trid, proposalTuple )
      if not result[ 'OK' ]:
        self._transportPool.sendAndClose( trid, result )
        return
      handlerObj = result[ 'Value' ]
      #Execute the action
      result = self._processProposal( trid, proposalTuple, handlerObj )
      #Close the connection if required
      if result[ 'closeTransport' ] or not result[ 'OK' ]:
        if not result[ 'OK' ]:
          gLogger.error( "Error processing proposal", result[ 'Message' ] )
        self._transportPool.close( trid )
      return result
    finally:
      self._lockManager.unlockGlobal()
      if monReport:
        self.__endReportToMonitoring( *monReport )


  def _createIdentityString( self, credDict, clientTransport = None ):
    if 'username' in credDict:
      if 'group' in credDict:
        identity = "[%s:%s]" % ( credDict[ 'username' ], credDict[ 'group' ] )
      else:
        identity = "[%s:unknown]" % credDict[ 'username' ]
    else:
      identity = 'unknown'
    if clientTransport:
      addr = clientTransport.getRemoteAddress()
      if addr:
        # append the remote endpoint, e.g. "{127.0.0.1:9135}"
        identity += "{%s:%s}" % ( addr[0], addr[1] )
    if 'DN' in credDict:
      identity += "(%s)" % credDict[ 'DN' ]
    return identity

  def _receiveAndCheckProposal( self, trid ):
    clientTransport = self._transportPool.get( trid )
    #Get the peer credentials
    credDict = clientTransport.getConnectingCredentials()
    #Receive the action proposal
    retVal = clientTransport.receiveData( 1024 )
    if not retVal[ 'OK' ]:
      gLogger.error( "Invalid action proposal", "%s %s" % ( self._createIdentityString( credDict,
                                                                                        clientTransport ),
                                                            retVal[ 'Message' ] ) )
      return S_ERROR( "Invalid action proposal" )
    proposalTuple = retVal[ 'Value' ]
    gLogger.debug( "Received action from client", "/".join( list( proposalTuple[1] ) ) )
    #Check if there are extra credentials
    if proposalTuple[2]:
      clientTransport.setExtraCredentials( proposalTuple[2] )
    #Check if this is the requested service
    requestedService = proposalTuple[0][0]
    if requestedService not in self._validNames:
      return S_ERROR( "%s is not up in this server" % requestedService )
    #Check if the action is valid
    requestedActionType = proposalTuple[1][0]
    if requestedActionType not in Service.SVC_VALID_ACTIONS:
      return S_ERROR( "%s is not a known action type" % requestedActionType )
    #Check if it's authorized
    result = self._authorizeProposal( proposalTuple[1], trid, credDict )
    if not result[ 'OK' ]:
      return result
    #Proposal is OK
    return S_OK( proposalTuple )
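
  # For reference, the layout of proposalTuple as used above and in
  # _instantiateHandler (inferred from the indexing in this code):
  #   proposalTuple[0] -> ( serviceName, clientSetup [, clientVO ] )
  #   proposalTuple[1] -> ( actionType, actionName )
  #   proposalTuple[2] -> extra credentials, if any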

  def _authorizeProposal( self, actionTuple, trid, credDict ):
    #Find CS path for the Auth rules
    referedAction = self._isMetaAction( actionTuple[0] )
    if referedAction:
      csAuthPath = "%s/Default" % actionTuple[0]
      hardcodedMethodAuth = self._actions[ 'auth' ][ actionTuple[0] ]
    else:
      if actionTuple[0] == 'RPC':
        csAuthPath = actionTuple[1]
      else:
        csAuthPath = "/".join( actionTuple )
      #Find if there are hardcoded auth rules in the code
      hardcodedMethodAuth = False
      if actionTuple[0] in self._actions[ 'auth' ]:
        hardcodedRulesByType = self._actions[ 'auth' ][ actionTuple[0] ]
        if actionTuple[0] == "FileTransfer":
          methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
        else:
          methodName = actionTuple[1]

        if methodName in hardcodedRulesByType:
          hardcodedMethodAuth = hardcodedRulesByType[ methodName ]
    #Auth time!
    if not self._authMgr.authQuery( csAuthPath, credDict, hardcodedMethodAuth ):
      #Get the identity string
      identity = self._createIdentityString( credDict )
      fromHost = "unknown host"
      tr = self._transportPool.get( trid )
      if tr:
        fromHost = '/'.join( [ str( item ) for item in tr.getRemoteAddress() ] )
      gLogger.warn( "Unauthorized query", "to %s:%s by %s from %s" % ( self._name,
                                                               "/".join( actionTuple ),
                                                               identity, fromHost ) )
      result = S_ERROR( "Unauthorized query" )
    else:
      result = S_OK()

    #Security log
    tr = self._transportPool.get( trid )
    if not tr:
      return S_ERROR( "Client disconnected" )
    sourceAddress = tr.getRemoteAddress()
    identity = self._createIdentityString( credDict )
    Service.SVC_SECLOG_CLIENT.addMessage( result[ 'OK' ], sourceAddress[0], sourceAddress[1], identity,
                                      self._cfg.getHostname(),
                                      self._cfg.getPort(),
                                      self._name, "/".join( actionTuple ) )
    return result

  def _instantiateHandler( self, trid, proposalTuple = None ):
    """
    Generate an instance of the handler for a given service
    """
    #Generate the client params
    clientParams = { 'serviceStartTime' : self._startTime }
    if proposalTuple:
      clientParams[ 'clientSetup' ] = proposalTuple[0][1]
      if len( proposalTuple[0] ) < 3:
        clientParams[ 'clientVO' ] = gConfig.getValue( "/DIRAC/VirtualOrganization", "unknown" )
      else:
        clientParams[ 'clientVO' ] = proposalTuple[0][2]
    clientTransport = self._transportPool.get( trid )
    if clientTransport:
      clientParams[ 'clientAddress' ] = clientTransport.getRemoteAddress()
    #Generate handler dict with per client info
    handlerInitDict = dict( self._serviceInfoDict )
    for key in clientParams:
      handlerInitDict[ key ] = clientParams[ key ]
    #Instantiate and initialize
    try:
      handlerInstance = self._handler[ 'class' ]( handlerInitDict, trid )
      handlerInstance.initialize()
    except Exception as e:
      gLogger.exception( "Server error while loading handler: %s" % str( e ) )
      return S_ERROR( "Server error while loading handler" )
    return S_OK( handlerInstance )

  def _processProposal( self, trid, proposalTuple, handlerObj ):
    #Notify the client we're ready to execute the action
    retVal = self._transportPool.send( trid, S_OK() )
    if not retVal[ 'OK' ]:
      return retVal

    messageConnection = False
    if proposalTuple[1] == ( 'Connection', 'new' ):
      messageConnection = True

    if messageConnection:

      if self._msgBroker.getNumConnections() > self._cfg.getMaxMessagingConnections():
        result = S_ERROR( "Maximum number of connections reached. Try later" )
        result[ 'closeTransport' ] = True
        return result

      #This is a stable connection
      self._msgBroker.addTransportId( trid, self._name,
                                       receiveMessageCallback = self._mbReceivedMsg,
                                       disconnectCallback = self._mbDisconnect,
                                       listenToConnection = False )

    result = self._executeAction( trid, proposalTuple, handlerObj )
    if result[ 'OK' ] and messageConnection:
      self._msgBroker.listenToTransport( trid )
      result = self._mbConnect( trid, handlerObj )
      if not result[ 'OK' ]:
        self._msgBroker.removeTransport( trid )

    result[ 'closeTransport' ] = not messageConnection or not result[ 'OK' ]
    return result

  def _mbConnect( self, trid, handlerObj = None ):
    if not handlerObj:
      result = self._instantiateHandler( trid )
      if not result[ 'OK' ]:
        return result
      handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeConnectionCallback( 'connected' )

  def _executeAction( self, trid, proposalTuple, handlerObj ):
    try:
      return handlerObj._rh_executeAction( proposalTuple )
    except Exception as e:
      gLogger.exception( "Exception while executing handler action" )
      return S_ERROR( "Server error while executing action: %s" % str( e ) )

  def _mbReceivedMsg( self, trid, msgObj ):
    result = self._authorizeProposal( ( 'Message', msgObj.getName() ),
                                      trid,
                                      self._transportPool.get( trid ).getConnectingCredentials() )
    if not result[ 'OK' ]:
      return result
    result = self._instantiateHandler( trid )
    if not result[ 'OK' ]:
      return result
    handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeMessageCallback( msgObj )

  def _mbDisconnect( self, trid ):
    result = self._instantiateHandler( trid )
    if not result[ 'OK' ]:
      return result
    handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeConnectionCallback( 'drop' )


  def __startReportToMonitoring( self ):
    self._monitor.addMark( "Queries" )
    now = time.time()
    stats = os.times()
    cpuTime = stats[0] + stats[2]
    if now - self.__monitorLastStatsUpdate < 0:
      return ( now, cpuTime )
    # CPU consumption is computed and reported in __endReportToMonitoring;
    # here we only refresh the last-update timestamp
    self.__monitorLastStatsUpdate = now
    # Send Memory consumption mark
    membytes = MemStat.VmB( 'VmRSS:' )
    if membytes:
      mem = membytes / ( 1024. * 1024. )
      self._monitor.addMark( 'MEM', mem )
    return ( now, cpuTime )

  def __endReportToMonitoring( self, initialWallTime, initialCPUTime ):
    wallTime = time.time() - initialWallTime
    stats = os.times()
    cpuTime = stats[0] + stats[2] - initialCPUTime
    if wallTime > 0:
      percentage = cpuTime / wallTime * 100.
      if percentage > 0:
        self._monitor.addMark( 'CPU', percentage )
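
The pair of monitoring helpers above works by sampling os.times(), whose first and third fields are the user and system CPU seconds consumed by the process; CPU usage is then CPU seconds divided by wall-clock seconds. A minimal standalone sketch of the same technique, independent of DIRAC (all names here are illustrative):

import os
import time

def measureCPUPercentage( work ):
  """ run work() and return its CPU usage as a percentage of wall-clock time """
  startWall = time.time()
  stats = os.times()
  startCPU = stats[0] + stats[2]  # user + system CPU seconds of this process
  work()
  wallTime = time.time() - startWall
  stats = os.times()
  cpuTime = stats[0] + stats[2] - startCPU
  return cpuTime / wallTime * 100. if wallTime else 0.

print measureCPUPercentage( lambda: sum( i * i for i in xrange( 10 ** 6 ) ) )
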
Code Example #9
File: FTSAgent.py Project: alexanderrichards/DIRAC
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts graph refresh in seconds
  FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
  # # SE R/W access refresh in seconds
  RW_REFRESH = 600
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  STAGE_FILES = False

  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSGraph
  __ftsGraph = None
  # # graph regeneration time delta
  __ftsGraphValidStamp = None
  # # r/w access valid stamp
  __rwAccessValidStamp = None
  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # se cache
  __seCache = dict()
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient

  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getSE( cls, seName ):
    """ keep SEs in cache """
    if seName not in cls.__seCache:
      cls.__seCache[seName] = StorageElement( seName )
    return cls.__seCache[seName]

  @classmethod
  def getSECache( cls ):
    return cls.__seCache

  @classmethod
  def getRequest( cls, reqName ):
    """ get Requests systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqName )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqName, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqName, None )
      return S_ERROR( "request of name '%s' not found in ReqDB" % reqName )
    cls.__reqCache[reqName] = getRequest

    return S_OK( cls.__reqCache[reqName] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestName not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestName, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.items():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict
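
  # Merge semantics, e.g. (hypothetical values):
  #   updateFTSFileDict( { 'toSubmit': [ fileA ] },
  #                      { 'toSubmit': [ fileA, fileB ] } )
  #   -> { 'toSubmit': [ fileA, fileB ] }   (fileA is not duplicated)
  # Only categories already present in ftsFilesDict are merged; extra keys
  # in toUpdateDict are silently ignored.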

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSGraph( self ):
    """ create fts graph """
    log = gLogger.getSubLogger( "ftsGraph" )

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      log.error( "unable to get FTS history: %s" % ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory )
    finally:
      self.updateLock().release()

    log.debug( "FTSSites: %s" % len( self.__ftsGraph.nodes() ) )
    for i, site in enumerate( self.__ftsGraph.nodes() ):
      log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) )
    log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) )
    for i, route in enumerate( self.__ftsGraph.edges() ):
      log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i,
                                                                             route.routeName,
                                                                             route.ActiveJobs,
                                                                             route.toNode.MaxActiveJobs ) )
    # # save graph stamp
    self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )

    # # refresh SE R/W access
    try:
      self.updateLock().acquire()
      self.__ftsGraph.updateRWAccess()
    finally:
      self.updateLock().release()
    # # save rw access stamp
    self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """


    # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH )
    log.info( "FTSGraph validity period       = %s s" % self.FTSGRAPH_REFRESH )
    self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH )
    log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH )

    self.STAGE_FILES = self.am_getOption( "StageFiles", self.STAGE_FILES )
    log.info( "Stage files before submission  = %s" % {True: "yes", False: "no"}[bool( self.STAGE_FILES )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = %s" % self.MAX_ACTIVE_JOBS )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = %d" % self.MAX_FILES_PER_JOB )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = %s" % self.MAX_ATTEMPT )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = %s" % self.MIN_THREADS )
    log.info( "ThreadPool max threads         = %s" % self.MAX_THREADS )

    log.info( "initialize: creation of FTSGraph..." )
    createGraph = self.resetFTSGraph()
    if not createGraph["OK"]:
      log.error( "initialize: %s" % createGraph["Message"] )
      return createGraph

    # This sets the default proxy to be the one defined under
    # /Operations/Shifter/DataManager;
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #  log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #  log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #  put = self.requestClient().putRequest( request )
    #  if not put["OK"]:
    #    log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """
    log = gLogger.getSubLogger( "execute" )
    # # reset FTSGraph if expired
    now = datetime.datetime.now()
    if now > self.__ftsGraphValidStamp:
      log.info( "resetting expired FTS graph..." )
      resetFTSGraph = self.resetFTSGraph()
      if not resetFTSGraph["OK"]:
        log.error( "FTSGraph recreation error: %s" % resetFTSGraph["Message"] )
        return resetFTSGraph
      self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # update R/W access in FTSGraph if expired
    if now > self.__rwAccessValidStamp:
      log.info( "updating expired R/W access for SEs..." )
      try:
        self.updateLock().acquire()
        self.__ftsGraph.updateRWAccess()
      finally:
        self.updateLock().release()
        self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH )

    requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] )
    if not requestNames["OK"]:
      log.error( "unable to read scheduled request names: %s" % requestNames["Message"] )
      return requestNames
    if not requestNames["Value"]:
      requestNames = self.__reqCache.keys()
    else:
      requestNames = [ req[0] for req in requestNames["Value"] ]
      requestNames = list( set ( requestNames + self.__reqCache.keys() ) )

    if not requestNames:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestNames ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestNames ) - len( self.__reqCache ) ) )

    for requestName in requestNames:
      request = self.getRequest( requestName )
      if not request["OK"]:
        log.error( request["Message"] )
        continue
      request = request["Value"]
      sTJId = request.RequestName
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "request '%s' enqueued for execution" % sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()
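
  # Note on the enqueueing loop above: generateJobAndQueueIt() returns an
  # error result while the thread pool queue is full, so execute() simply
  # retries once per second until the request is accepted, instead of
  # dropping it.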

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( request.RequestName )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "waiting returned operation is not an operation: %s" % type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but %s" % operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got %s" % operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select  FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

      if ftsJobs:
        log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in ftsJobs:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.items():
          if ftsFiles:
            log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # The condition below should never happen; log a warning so it can be investigated
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %s 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # request.Status becomes "Failed" if all files in the operation are "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          return self.putRequest( request )

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.items():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed: %s" % update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations: %s" % registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We also get Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status == 'Failed':
              # If the failure is recoverable and the file is not yet in toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestName, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except Exception as exceptMessage:
      log.exception( "Exception in processRequest", exceptMessage )
    finally:
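
processRequest() wraps all of its phases in a try/finally so that, as the comment before PHASE 0 states, the FTS jobs are put back whatever happens while processing. Reduced to a minimal standalone sketch (hypothetical names):

def processWithPutBack( jobs, monitorOne, putBack ):
  """ process jobs and persist them again no matter what happens """
  try:
    for job in jobs:
      monitorOne( job )  # any processing phase may raise
  finally:
    putBack( jobs )      # always runs, also on exception or early return
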
Code Example #10
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts placement refresh in seconds
  FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # Max number of requests fetched from the RMS
  MAX_REQUESTS = 100
  # Minimum interval (seconds) between 2 job monitoring
  MONITORING_INTERVAL = 600

  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for the FTS version
  __ftsVersion = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSPlacement
  __ftsPlacement = None

  # # placement regeneration time delta
  __ftsPlacementValidStamp = None

  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient

  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getRequest( cls, reqID ):
    """ get Requests systematically and refresh cache """

    # Make sure the request is Scheduled
    res = cls.requestClient().getRequestStatus( reqID )
    if not res['OK']:
      cls.__reqCache.pop( reqID, None )
      return res
    status = res['Value']
    if status != 'Scheduled':
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "Request with id %s is not Scheduled:%s" % ( reqID, status ) )

    getRequest = cls.requestClient().getRequest( reqID )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqID, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "request of id '%s' not found in ReqDB" % reqID )
    cls.__reqCache[reqID] = getRequest

    return S_OK( cls.__reqCache[reqID] )
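
  # Unlike the name-keyed getRequest() in Example #9 above, this version keys
  # the cache by RequestID and first verifies that the request is still
  # 'Scheduled', dropping the cache entry on any failure.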

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestID not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestID, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.iteritems():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool


  def resetFTSPlacement( self ):
    """ create fts Placement """

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()

    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    return S_OK()
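
  # On the first call resetFTSPlacement() builds the placement from scratch;
  # afterwards it only refreshes it from the FTS history views. Both paths
  # run under updateLock(), presumably because worker threads read the
  # placement concurrently.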

  def initialize( self ):
    """ agent's initialization """


    # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH )
    log.info( "FTSPlacement validity period       = %s s" % self.FTSPLACEMENT_REFRESH )


    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission  = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = ", str( self.MAX_FILES_PER_JOB ) )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = ", str( self.MAX_ATTEMPT ) )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads         = ", str( self.MAX_THREADS ) )

    self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS )
    log.info( "Max Requests fetched           = ", str( self.MAX_REQUESTS ) )

    self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL )
    log.info( "Minimum monitoring interval    = ", str( self.MONITORING_INTERVAL ) )

    self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' )
    log.info( "FTSVersion : %s" % self.__ftsVersion )
    log.info( "initialize: creation of FTSPlacement..." )
    createPlacement = self.resetFTSPlacement()
    if not createPlacement["OK"]:
      log.error( "initialize:", createPlacement["Message"] )
      return createPlacement

    # This sets the default proxy to be the one defined under
    # /Operations/Shifter/DataManager;
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    self.registrationProtocols = DMSHelpers().getRegistrationProtocols()


    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #  log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #  log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #  put = self.requestClient().putRequest( request )
    #  if not put["OK"]:
    #    log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """

    # Don't use the server certificate, otherwise the DFC won't let us write
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )


    log = gLogger.getSubLogger( "execute" )
    # # reset FTSPlacement if expired
    now = datetime.datetime.now()
    if now > self.__ftsPlacementValidStamp:
      log.info( "resetting expired FTS placement..." )
      resetFTSPlacement = self.resetFTSPlacement()
      if not resetFTSPlacement["OK"]:
        log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] )
        return resetFTSPlacement
      self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS )
    if not requestIDs["OK"]:
      log.error( "unable to read scheduled request ids" , requestIDs["Message"] )
      return requestIDs
    if not requestIDs["Value"]:
      requestIDs = []
    else:
      requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ]
    requestIDs += self.__reqCache.keys()

    if not requestIDs:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestIDs ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) )

    for requestID in requestIDs:
      request = self.getRequest( requestID )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestID
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "Waiting returned operation is not an operation:", type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select  FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

      now = datetime.datetime.utcnow()
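      # Monitor a job only if its last update is older than the monitoring
      # interval; jobs whose source is a tape SE get a 3x longer interval,
      # since staging makes those transfers slower.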
      jobsToMonitor = [job for job in ftsJobs if
                       ( now - job.LastUpdate ).seconds >
                       ( self.MONITORING_INTERVAL * ( 3. if StorageElement( job.SourceSE ).getStatus().get( 'Value', {} ).get( 'TapeSE' ) else 1. ) )
                       ]
      if jobsToMonitor:
        log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in jobsToMonitor:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.iteritems():
          if ftsFiles:
            log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )
      if len( ftsJobs ) != len( jobsToMonitor ):
        log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) )
        if not jobsToMonitor:
          # Nothing more to do this time; escape to the finally clause
          raise EscapeTryException

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # The condition below should never happen; log a warning so it can be investigated
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems()
                              if not [ftsFile for ftsFile in ftsFiles
                                      if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Active transfers for which there is no FTS job any longer and reschedule them
            for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]:
                ftsFilesDict['toReschedule'].append( ftsFile )
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

            # Recover files that are Failed but were not spotted
            for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              reschedule, submit, fail = self.__checkFailed( ftsFile )
              if fail and ftsFile not in ftsFilesDict['toFail']:
                ftsFilesDict['toFail'].append( ftsFile )
              elif reschedule and ftsFile not in ftsFilesDict['toReschedule']:
                ftsFilesDict['toReschedule'].append( ftsFile )
              elif submit and ftsFile not in ftsFilesDict['toSubmit']:
                ftsFilesDict['toSubmit'].append( ftsFile )

            # If all transfers are finished for unregistered files and there is already a registration operation, set it Done
            ftsLFNs = [f.LFN for f in ftsFiles]
            for lfn in missingReplicas:
              # We make sure here that the file is being processed by FTS
              if lfn in ftsLFNs:
                if not [f for f in ftsFiles if f.LFN == lfn and ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]:
                  for opFile in operation:
                    if opFile.LFN == lfn:
                      opFile.Status = 'Done'
                      break
              else:
                # Temporary log
                log.warn( "File with missing replica not in FTS files", lfn )
          for key, ftsFiles in ftsFilesDict.iteritems():
            if ftsFiles:
              log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # request.Status should be Failed if all files in the operation are "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise EscapeTryException

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.iteritems():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( 'Failed to reschedule files', rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set( ftsFile.FTSFileID for ftsFile in toSubmit )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # If the file has not failed unrecoverably and is not yet in toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # should not put back jobs that have not been monitored this time
      ftsJobs = jobsToMonitor
      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
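          # # drop duplicate ( LFN, SourceSE, TargetSE ) triplets before building jobs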
          self.__checkDuplicates( request.RequestID, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except EscapeTryException:
      # This exception is raised when one wants to return from within the try: block
      # only put back jobs that were monitored
      ftsJobs = jobsToMonitor
    except Exception as exceptMessage:
      log.exception( "Exception in processRequest", lException = exceptMessage )
    finally:
      putRequest = self.putRequest( request, clearCache = ( request.Status != "Scheduled" ) )
      if not putRequest["OK"]:
        log.error( "unable to put back request:", putRequest["Message"] )
      # #  put back jobs in all cases
      if ftsJobs:
        for ftsJob in list( ftsJobs ):
          if not len( ftsJob ):
            log.warn( 'FTS job empty, removed: %s' % ftsJob.FTSGUID )
            self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
            ftsJobs.remove( ftsJob )
        putJobs = self.putFTSJobs( ftsJobs )
        if not putJobs["OK"]:
          log.error( "unable to put back FTSJobs:", putJobs["Message"] )
          putRequest = putJobs
    # This is where one returns from after execution of the finally: block
    return putRequest

  def __checkDuplicates( self, reqID, toSubmit ):
    """ Check in a list of FTSFiles whether there are duplicates
    """
    tupleList = []
    log = self.log.getSubLogger( "%s/checkDuplicates" % reqID )
    for ftsFile in list( toSubmit ):
      fTuple = ( ftsFile.LFN, ftsFile.SourceSE, ftsFile.TargetSE )
      if fTuple in tupleList:
        log.warn( "Duplicate file to submit, removed:", ', '.join( fTuple ) )
        toSubmit.remove( ftsFile )
        self.ftsClient().deleteFTSFiles( ftsFile.OperationID, [ftsFile.FileID] )
      else:
        tupleList.append( fTuple )


  def __reschedule( self, request, operation, toReschedule ):
    """ reschedule list of :toReschedule: files in request for operation :operation:

    :param Request request:
    :param Operation operation:
    :param list toReschedule: list of FTSFiles
    """
    log = self.log.getSubLogger( "req_%s/%s/reschedule" % ( request.RequestID, request.RequestName ) )

    ftsFileIDs = [ftsFile.FileID for ftsFile in toReschedule]
    for opFile in operation:
      if opFile.FileID in ftsFileIDs:
        opFile.Status = "Waiting"

    toSchedule = []

    # # filter files
    for opFile in [ opFile for opFile in operation if opFile.Status == "Waiting" ]:

      replicas = self.__filterReplicas( opFile )
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]
      validReplicas = replicas["Valid"]
      noMetaReplicas = replicas["NoMetadata"]
      noReplicas = replicas["NoReplicas"]
      badReplicas = replicas['Bad']

      if validReplicas:
        validTargets = list( set( operation.targetSEList ) - set( validReplicas ) )
        if not validTargets:
          log.info( "file %s is already present at all targets" % opFile.LFN )
          opFile.Status = "Done"
        else:
          toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) )
      elif noMetaReplicas:
        log.warn( "unable to schedule '%s', couldn't get metadata at %s" % ( opFile.LFN, ','.join( noMetaReplicas ) ) )
      elif noReplicas:
        log.warn( "unable to schedule %s, file doesn't exist at %s" % ( opFile.LFN, ','.join( noReplicas ) ) )
        opFile.Status = 'Failed'
      elif badReplicas:
        log.warn( "unable to schedule %s, all replicas have a bad checksum at %s" % ( opFile.LFN, ','.join( badReplicas ) ) )
        opFile.Status = 'Failed'

    # # do real schedule here
    if toSchedule:
      log.info( "Rescheduling %d files" % len( toReschedule ) )
      ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID,
                                                  operation.OperationID,
                                                  toSchedule )
      if not ftsSchedule["OK"]:
        log.error( "Error scheduling files", ftsSchedule["Message"] )
        return ftsSchedule

      ftsSchedule = ftsSchedule["Value"]
      for opFile in operation:
        fileID = opFile.FileID
        if fileID in ftsSchedule["Successful"]:
          opFile.Status = "Scheduled"
        elif fileID in ftsSchedule["Failed"]:
          opFile.Error = ftsSchedule["Failed"][fileID]
          log.error( "Error scheduling file %s" % opFile.LFN, opFile.Error )

    return S_OK()

  def __submit( self, request, operation, toSubmit ):
    """ create and submit new FTSJobs using list of FTSFiles

    :param Request request: ReqDB.Request instance
    :param Operation operation: ReplicateAndRegister operation
    :param list toSubmit: list of FTSFile instances

    :return: S_OK( [ FTSJob, FTSJob, ... ] )
    """
    log = self.log.getSubLogger( "req_%s/%s/submit" % ( request.RequestID, request.RequestName ) )

    bySourceAndTarget = {}
    for ftsFile in toSubmit:
      # # group files by ( SourceSE, TargetSE ) pair
      bySourceAndTarget.setdefault( ftsFile.SourceSE, {} ).setdefault( ftsFile.TargetSE, [] ).append( ftsFile )

    ftsJobs = []

    for source, targetDict in bySourceAndTarget.iteritems():

      for target, ftsFileList in targetDict.iteritems():

        log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) )

        route = self.__ftsPlacement.findRoute( source, target )
        if not route["OK"]:
          log.error( route["Message"] )
          continue
        route = route["Value"]

        routeValid = self.__ftsPlacement.isRouteValid( route )

        if not routeValid['OK']:
          log.error( "Route invalid : %s" % routeValid['Message'] )
          continue

        sourceSE = StorageElement( source )
        sourceToken = sourceSE.getStorageParameters( protocol = 'srm' )
        if not sourceToken["OK"]:
          log.error( "unable to get sourceSE parameters:", "(%s) %s" % ( source, sourceToken["Message"] ) )
          continue
        seStatus = sourceSE.getStatus()
        if not seStatus["OK"]:
          log.error( "unable to get sourceSE status:", "(%s) %s" % ( source, seStatus["Message"] ) )
          continue
        seStatus = seStatus["Value"]

        targetSE = StorageElement( target )
        targetToken = targetSE.getStorageParameters( protocol = 'srm' )
        if not targetToken["OK"]:
          log.error( "unable to get targetSE parameters:", "(%s) %s" % ( target, targetToken["Message"] ) )
          continue

        # # create FTSJob
        for fileList in breakListIntoChunks( ftsFileList, self.MAX_FILES_PER_JOB ):
          ftsJob = FTSJob()
          ftsJob.RequestID = request.RequestID
          ftsJob.OperationID = operation.OperationID
          ftsJob.SourceSE = source
          ftsJob.TargetSE = target
          ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" )
          ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" )
          ftsJob.FTSServer = route.ftsServer

          for ftsFile in fileList:
            ftsFile.Attempt += 1
            ftsFile.Error = ""
            ftsJob.addFile( ftsFile )

          submit = ftsJob.submitFTS( self.__ftsVersion, command = self.SUBMIT_COMMAND, pinTime = self.PIN_TIME if seStatus['TapeSE'] else 0 )
          if not submit["OK"]:
            log.error( "unable to submit FTSJob:", submit["Message"] )
            continue

          log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

          # # update statuses for job files; Attempt was already incremented before submission
          for ftsFile in ftsJob:
            ftsFile.FTSGUID = ftsJob.FTSGUID
            ftsFile.Status = "Submitted"

          # # update placement route
          try:
            self.updateLock().acquire()
            self.__ftsPlacement.startTransferOnRoute( route )
          finally:
            self.updateLock().release()

          ftsJobs.append( ftsJob )

    log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) )
    return S_OK( ftsJobs )

  def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID, request.RequestName, ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

    monitor = ftsJob.monitorFTS( self.__ftsVersion , command = self.MONITOR_COMMAND )
    if not monitor["OK"]:
      gMonitor.addMark( "FTSMonitorFail", 1 )
      log.error( monitor["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \
         'was not found' in monitor['Message'] or\
         "Not found" in monitor['Message'] or\
         'Unknown transfer state' in monitor['Message']:
        log.error( "FTSJob not known (expired on server?): delete it" )
        for ftsFile in ftsJob:
          ftsFile.Status = "Waiting"
          ftsFilesDict["toSubmit"].append( ftsFile )
        # #  No way further for that job: delete it
        res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
        if not res['OK']:
          log.error( "Unable to delete FTSJob", res['Message'] )
        return S_OK( ftsFilesDict )
      return monitor

    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s%%" % ( ftsJob.Status, ftsJob.Completeness ) )

    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )

    if ftsJob.Status in FTSJob.FINALSTATES:
      finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
      if not finalizeFTSJob["OK"]:
        if 'Unknown transfer state' in finalizeFTSJob['Message']:
          for ftsFile in ftsJob:
            ftsFile.Status = "Waiting"
            ftsFilesDict["toSubmit"].append( ftsFile )
          # #  No way further for that job: delete it
          res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
          if not res['OK']:
            log.error( "Unable to delete FTSJob", res['Message'] )
        else:
          log.error( finalizeFTSJob["Message"] )
          return finalizeFTSJob
      else:
        ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )

    return S_OK( ftsFilesDict )

  def __finalizeFTSJob( self, request, ftsJob ):
    """ finalize FTSJob

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s/finalize" % ( request.RequestID,
                                                                     request.RequestName,
                                                                     ftsJob.FTSJobID ) )
    log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )


    monitor = ftsJob.monitorFTS( self.__ftsVersion, command = self.MONITOR_COMMAND, full = True )
    if not monitor["OK"]:
      log.error( monitor["Message"] )
      return monitor

    # # split FTSFiles to different categories
    processFiles = self.__filterFiles( ftsJob )
    if not processFiles["OK"]:
      log.error( processFiles["Message"] )
      return processFiles
    processFiles = processFiles['Value']
    if processFiles['toRegister']:
      log.error( "Some files could not be registered in FC:", len( processFiles['toRegister'] ) )
    ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles )

    # # send accounting record for this job
    self.__sendAccounting( ftsJob, request.OwnerDN )

    # # update placement - remove this job from placement
    route = self.__ftsPlacement.findRoute( ftsJob.SourceSE, ftsJob.TargetSE )
    if route["OK"]:
      try:
        self.updateLock().acquire()
        self.__ftsPlacement.finishTransferOnRoute( route['Value'] )
      finally:
        self.updateLock().release()

    log.info( "FTSJob is finalized" )

    return S_OK( ftsFilesDict )

  def __checkFailed( self, ftsFile ):
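    """ decide what to do with a Failed/Canceled FTSFile

    :return: ( reschedule, submit, fail ) tuple - reschedule if the source is missing,
             resubmit while Attempt < MAX_ATTEMPT, fail otherwise
    """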
    reschedule = False
    submit = False
    fail = False
    if ftsFile.Status in ( "Failed", 'Canceled' ):
      if ftsFile.Error == "MissingSource":
        reschedule = True
      else:
        if ftsFile.Attempt < self.MAX_ATTEMPT:
          submit = True
        else:
          fail = True
    return reschedule, submit, fail

  def __filterFiles( self, ftsJob ):
    """ process ftsFiles from finished ftsJob

    :param FTSJob ftsJob: monitored FTSJob instance
    """
    # # lists for different categories
    toUpdate = []
    toReschedule = []
    toRegister = []
    toSubmit = []
    toFail = []

    # # loop over files in fts job
    for ftsFile in ftsJob:
      # # successful files
      if ftsFile.Status == "Finished":
        if ftsFile.Error == "AddCatalogReplicaFailed":
          toRegister.append( ftsFile )
        toUpdate.append( ftsFile )
        continue
      reschedule, submit, fail = self.__checkFailed( ftsFile )
      if reschedule:
        toReschedule.append( ftsFile )
      elif submit:
        toSubmit.append( ftsFile )
      elif fail:
        toFail.append( ftsFile )

    return S_OK( { "toUpdate": toUpdate,
                   "toSubmit": toSubmit,
                   "toRegister": toRegister,
                   "toReschedule": toReschedule,
                   "toFail": toFail } )

  def __insertRegisterOperation( self, request, operation, toRegister ):
    """ add RegisterReplica operation

    :param Request request: request instance
    :param Operation transferOp: 'ReplicateAndRegister' operation for this FTSJob
    :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
    """
    log = self.log.getSubLogger( "req_%s/%s/registerFiles" % ( request.RequestID, request.RequestName ) )

    byTarget = {}
    for ftsFile in toRegister:
      byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile )
    log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) )

    for target, ftsFileList in byTarget.iteritems():
      log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target,
                                                                                            len( ftsFileList ) ) )
      registerOperation = Operation()
      registerOperation.Type = "RegisterReplica"
      registerOperation.Status = "Waiting"
      registerOperation.TargetSE = target
      targetSE = StorageElement( target )
      for ftsFile in ftsFileList:
        opFile = File()
        opFile.LFN = ftsFile.LFN
        pfn = returnSingleResult( targetSE.getURL( ftsFile.LFN, protocol = self.registrationProtocols ) )
        if not pfn["OK"]:
          continue
        opFile.PFN = pfn["Value"]
        registerOperation.addFile( opFile )
      request.insertBefore( registerOperation, operation )

    return S_OK()

  @staticmethod
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send DataOperation to AccouringDB """

    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )

    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"

    username = getUsernameForDN( ownerDN )
    if not username["OK"]:
      username = ownerDN
    else:
      username = username["Value"]

    accountingDict["User"] = username
    accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
    accountingDict['ExecutionSite'] = ftsJob.FTSServer

    accountingDict['RegistrationTime'] = ftsJob._regTime
    accountingDict['RegistrationOK'] = ftsJob._regSuccess
    accountingDict['RegistrationTotal'] = ftsJob._regTotal

    accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE

    # dt = ftsJob.LastUpdate - ftsJob.SubmitTime
    # transferTime = dt.days * 86400 + dt.seconds
    # accountingDict["TransferTime"] = transferTime
    accountingDict['TransferTime'] = sum( int( f._duration ) for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES )
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()

  def __checkReadyReplicas( self, request, operation ):
    """ check ready replicas for transferOperation """
    log = self.log.getSubLogger( "req_%s/%s/checkReadyReplicas" % ( request.RequestID, request.RequestName ) )

    targetSESet = set( operation.targetSEList )

    # # { LFN: [ targetSE, ... ] }
    missingReplicas = {}

    scheduledFiles = dict( ( opFile.LFN, opFile ) for opFile in operation if opFile.Status in ( "Scheduled", "Waiting" ) )
    # # get replicas
    replicas = FileCatalog().getReplicas( scheduledFiles.keys() )
    if not replicas["OK"]:
      self.log.error( replicas["Message"] )
      return replicas
    replicas = replicas["Value"]

    fullyReplicated = 0
    missingSEs = {}
    for successfulLFN in replicas["Successful"]:
      reps = set( replicas['Successful'][successfulLFN] )
      if targetSESet.issubset( reps ):
        log.verbose( "%s has been replicated to all targets" % successfulLFN )
        fullyReplicated += 1
        scheduledFiles[successfulLFN].Status = "Done"
      else:
        missingReplicas[successfulLFN] = sorted( targetSESet - reps )
        ses = ",".join( missingReplicas[ successfulLFN ] )
        missingSEs[ses] = missingSEs.get( ses, 0 ) + 1
        log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) )
    if fullyReplicated:
      log.info( "%d new files have been replicated to all targets" % fullyReplicated )
    if missingSEs:
      for ses in missingSEs:
        log.info( "%d replicas still missing at %s" % ( missingSEs[ses], ses ) )

    reMissing = re.compile( "no such file or directory" )
    for failedLFN, errStr in replicas["Failed"].iteritems():
      scheduledFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        log.error( "%s is missing, setting its status to 'Failed'" % failedLFN )
        scheduledFiles[failedLFN].Status = "Failed"
      else:
        log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) )

    return S_OK( missingReplicas )

  def __filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs """
    from DIRAC.DataManagementSystem.Agent.RequestOperations.ReplicateAndRegister import filterReplicas
    return filterReplicas( opFile, logger = self.log, dataManager = self.dataManager )
Code example #11
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts graph refresh in seconds
  FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
  # # SE R/W access refresh in seconds
  RW_REFRESH = 600
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'

  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSGraph
  __ftsGraph = None
  # # graph regeneration time delta
  __ftsGraphValidStamp = None
  # # r/w access valid stamp
  __rwAccessValidStamp = None
  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # se cache
  __seCache = dict()
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient



  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getSE( cls, seName ):
    """ keep SEs in cache """
    if seName not in cls.__seCache:
      cls.__seCache[seName] = StorageElement( seName )
    return cls.__seCache[seName]

  @classmethod
  def getSECache( cls ):
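    """ SE cache getter """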
    return cls.__seCache

  @classmethod
  def getRequest( cls, reqName ):
    """ get Requests systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqName )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqName, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqName, None )
      return S_ERROR( "request of name '%s' not found in ReqDB" % reqName )
    cls.__reqCache[reqName] = getRequest

    return S_OK( cls.__reqCache[reqName] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestName not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize the request if it is Done and attached to a job
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestName, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.items():
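      # # append only FTSFiles not already present in this category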
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSGraph( self ):
    """ create fts graph """
    log = gLogger.getSubLogger( "ftsGraph" )

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory, maxActiveJobs = self.MAX_ACTIVE_JOBS )
    finally:
      self.updateLock().release()

    log.debug( "FTSSites:", len( self.__ftsGraph.nodes() ) )
    for i, site in enumerate( self.__ftsGraph.nodes() ):
      log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) )
    log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) )
    for i, route in enumerate( self.__ftsGraph.edges() ):
      log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i,
                                                                             route.routeName,
                                                                             route.ActiveJobs,
                                                                             route.toNode.MaxActiveJobs ) )
    # # save graph stamp
    self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )

    # # refresh SE R/W access
    try:
      self.updateLock().acquire()
      self.__ftsGraph.updateRWAccess()
    finally:
      self.updateLock().release()
    # # save rw access stamp
    self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """


    # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH )
    log.info( "FTSGraph validity period       = %s s" % self.FTSGRAPH_REFRESH )
    self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH )
    log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH )

    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s, monitor = %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission  = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = ", str( self.MAX_FILES_PER_JOB ) )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = ", str( self.MAX_ATTEMPT ) )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
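    # # sanitise the settings: negative values are made positive and min/max are swapped if needed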
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads         = ", str( self.MAX_THREADS ) )

    log.info( "initialize: creation of FTSGraph..." )
    createGraph = self.resetFTSGraph()
    if not createGraph["OK"]:
      log.error( "initialize: ", createGraph["Message"] )
      return createGraph

    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #  log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #  log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #  put = self.requestClient().putRequest( request )
    #  if not put["OK"]:
    #    log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """
    log = gLogger.getSubLogger( "execute" )
    # # reset FTSGraph if expired
    now = datetime.datetime.now()
    if now > self.__ftsGraphValidStamp:
      log.info( "resetting expired FTS graph..." )
      resetFTSGraph = self.resetFTSGraph()
      if not resetFTSGraph["OK"]:
        log.error( "FTSGraph recreation error: ", resetFTSGraph["Message"] )
        return resetFTSGraph
      self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # update R/W access in FTSGraph if expired
    if now > self.__rwAccessValidStamp:
      log.info( "updating expired R/W access for SEs..." )
      try:
        self.updateLock().acquire()
        self.__ftsGraph.updateRWAccess()
      finally:
        self.updateLock().release()
        self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH )

    requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] )
    if not requestNames["OK"]:
      log.error( "Unable to read scheduled request names: ", requestNames["Message"] )
      return requestNames
    if not requestNames["Value"]:
      requestNames = self.__reqCache.keys()
    else:
      requestNames = [ req[0] for req in requestNames["Value"] ]
      requestNames = list( set ( requestNames + self.__reqCache.keys() ) )

    if not requestNames:
      log.info( "No 'Scheduled' requests to process" )
      return S_OK()

    log.info( "Found requests to process:", str( len( requestNames ) ) )
    log.info( " => from internal cache:", str( ( len( self.__reqCache ) ) ) )
    log.info( " =>   new read from RMS:", str( ( len( requestNames ) - len( self.__reqCache ) ) ) )

    for requestName in requestNames:
      request = self.getRequest( requestName )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestName, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestName
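      # # retry every second until a slot in the thread pool becomes available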
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( request.RequestName )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "Waiting returned operation is not an operation:", type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select  FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: to make sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

      if ftsJobs:
        log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in ftsJobs:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.items():
          if ftsFiles:
            log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # Actually the condition below should never happen... Change printout for checking
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.items()
                              if not [ftsFile for ftsFile in ftsFiles
                                      if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %s 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # request.Status should be Failed if all files in the operation are "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise escapeTry

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.items():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set( ftsFile.FTSFileID for ftsFile in toSubmit )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # If the file has not failed unrecoverably and is not yet in toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestName, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except escapeTry:
      # This exception is raised when one wants to return from within the try: block
      pass
    except Exception as exceptMessage:
      log.exception( "Exception in processRequest", lException = exceptMessage )
    finally:
Code example #12
File: RemovalAgent.py Project: closier/DIRAC
class RemovalAgent( AgentModule, RequestAgentMixIn ):
  """
    This Agent takes care of executing "removal" request from the RequestManagement system
  """

  def __init__( self, *args ):
    """
    Initialize the base class and define some extra data members
    """
    AgentModule.__init__( self, *args )
    self.requestDBClient = None
    self.replicaManager = None
    self.maxNumberOfThreads = 4
    self.maxRequestsInQueue = 100
    self.threadPool = None
    self.timeOutCounter = 0
    self.pendingRequests = True

  def initialize( self ):
    """
      Called by the framework upon startup, before any cycle (execute method below)
    """
    self.requestDBClient = RequestClient()
    # the RequestAgentMixIn needs the capitalized version; until it is fixed keep this.
    self.RequestDBClient = self.requestDBClient
    self.replicaManager = ReplicaManager()

    gMonitor.registerActivity( "Iteration", "Agent Loops", "RemovalAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Execute", "Request Processed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "PhysicalRemovalAtt", "Physical removals attempted",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalDone", "Successful physical removals",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalFail", "Failed physical removals",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalSize", "Physically removed size",
                               "RemovalAgent", "Bytes", gMonitor.OP_ACUM )

    gMonitor.registerActivity( "ReplicaRemovalAtt", "Replica removal attempted",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalDone", "Successful replica removals",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalFail", "Failed replica removals",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "RemoveFileAtt", "File removal attempted",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileDone", "File removal done",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileFail", "File removal failed",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )

    self.maxNumberOfThreads = self.am_getOption( 'NumberOfThreads', self.maxNumberOfThreads )
    self.maxRequestsInQueue = self.am_getOption( 'RequestsInQueue', self.maxRequestsInQueue )
    self.threadPool = ThreadPool( 1, self.maxNumberOfThreads, self.maxRequestsInQueue )

    # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted
    self.threadPool.daemonize()

    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):
    """
    Fill the ThreadPool with ThreadedJobs
    """
    self.pendingRequests = True
    while self.pendingRequests:
      requestExecutor = ThreadedJob( self.executeRequest )
      ret = self.threadPool.queueJob( requestExecutor )
      if not ret['OK']:
        break
      time.sleep( 0.1 )

    if self.timeOutCounter:
      gLogger.error( 'Timeouts during removal execution:', self.timeOutCounter )

    return S_OK()

  def executeRequest( self ):
    """
    Do the actual work in the Thread
    """
    ################################################
    # Get a request from request DB
    gMonitor.addMark( "Iteration", 1 )
    res = self.requestDBClient.getRequest( 'removal' )
    if not res['OK']:
      gLogger.info( "RemovalAgent.execute: Failed to get request from database." )
      return S_OK()
    elif not res['Value']:
      gLogger.info( "RemovalAgent.execute: No requests to be executed found." )
      self.pendingRequests = False
      return S_OK()
    requestString = res['Value']['RequestString']
    requestName = res['Value']['RequestName']
    sourceServer = res['Value']['Server']
    try:
      jobID = int( res['Value']['JobID'] )
    except ValueError:
      jobID = 0
    gLogger.info( "RemovalAgent.execute: Obtained request %s" % requestName )

    try:

      result = self.requestDBClient.getCurrentExecutionOrder( requestName, sourceServer )
      if result['OK']:
        currentOrder = result['Value']
      else:
        gLogger.error( 'Can not get the request execution order' )
        self.requestDBClient.updateRequest( requestName, requestString, sourceServer )
        return S_OK( 'Can not get the request execution order' )

      oRequest = RequestContainer( request = requestString )

      ################################################
      # Find the number of sub-requests from the request
      res = oRequest.getNumSubRequests( 'removal' )
      if not res['OK']:
        errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests."
        gLogger.error( errStr, res['Message'] )
        return S_OK()
      gLogger.info( "RemovalAgent.execute: Found %s sub requests." % res['Value'] )

      ################################################
      # For all the sub-requests in the request
      modified = False
      for ind in range( res['Value'] ):
        gMonitor.addMark( "Execute", 1 )
        gLogger.info( "RemovalAgent.execute: Processing sub-request %s." % ind )
        subRequestAttributes = oRequest.getSubRequestAttributes( ind, 'removal' )['Value']
        subExecutionOrder = int( subRequestAttributes['ExecutionOrder'] )
        subStatus = subRequestAttributes['Status']
        if subStatus == 'Waiting' and subExecutionOrder <= currentOrder:
          subRequestFiles = oRequest.getSubRequestFiles( ind, 'removal' )['Value']
          operation = subRequestAttributes['Operation']

          ################################################
          #  If the sub-request is a physical removal operation
          if operation == 'physicalRemoval':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSEs = subRequestAttributes['TargetSE'].split( ',' )
            physicalFiles = []
            pfnToLfn = {}
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                pfn = str( subRequestFile['PFN'] )
                lfn = str( subRequestFile['LFN'] )
                pfnToLfn[pfn] = lfn
                physicalFiles.append( pfn )
            gMonitor.addMark( 'PhysicalRemovalAtt', len( physicalFiles ) )
            failed = {}
            errMsg = {}
            for diracSE in diracSEs:
              res = self.replicaManager.removeStorageFile( physicalFiles, diracSE )
              if res['OK']:
                for pfn in res['Value']['Failed'].keys():
                  if not failed.has_key( pfn ):
                    failed[pfn] = {}
                  failed[pfn][diracSE] = res['Value']['Failed'][pfn]
              else:
                errMsg[diracSE] = res['Message']
                for pfn in physicalFiles:
                  if not failed.has_key( pfn ):
                    failed[pfn] = {}
                  failed[pfn][diracSE] = 'Completely'
            # Now analyse the results
            failedPFNs = failed.keys()
            pfnsOK = [pfn for pfn in physicalFiles if not pfn in failedPFNs]
            gMonitor.addMark( 'PhysicalRemovalDone', len( pfnsOK ) )
            for pfn in pfnsOK:
              gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( pfn, str( diracSEs ) ) )
              res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
              if not res['OK']:
                gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
              modified = True
            if failed:
              gMonitor.addMark( 'PhysicalRemovalFail', len( failedPFNs ) )
              for pfn in failedPFNs:
                for diracSE in failed[pfn].keys():
                  if type( failed[pfn][diracSE] ) in StringTypes:
                    if re.search( 'no such file or directory', failed[pfn][diracSE].lower() ):
                      gLogger.info( "RemovalAgent.execute: File did not exist.", pfn )
                      res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
                      if not res['OK']:
                        gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
                      modified = True
                    else:
                      gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( pfn, diracSE, failed[pfn][diracSE] ) )
            if errMsg:
              for diracSE in errMsg.keys():
                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE
                gLogger.error( errStr, errMsg[diracSE] )


          ################################################
          #  If the sub-request is a file removal operation
          elif operation == 'removeFile':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            lfns = []
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                lfn = str( subRequestFile['LFN'] )
                lfns.append( lfn )
            gMonitor.addMark( 'RemoveFileAtt', len( lfns ) )
            res = self.replicaManager.removeFile( lfns )
            if res['OK']:
              gMonitor.addMark( 'RemoveFileDone', len( res['Value']['Successful'].keys() ) )
              for lfn in res['Value']['Successful'].keys():
                gLogger.info( "RemovalAgent.execute: Successfully removed %s." % lfn )
                result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                if not result['OK']:
                  gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                modified = True
              gMonitor.addMark( 'RemoveFileFail', len( res['Value']['Failed'].keys() ) )
              for lfn in res['Value']['Failed'].keys():
                if type( res['Value']['Failed'][lfn] ) in StringTypes:
                  if re.search( 'no such file or directory', res['Value']['Failed'][lfn].lower() ):
                    gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                    result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                    if not result['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                    modified = True
                  else:
                    gLogger.info( "RemovalAgent.execute: Failed to remove file:",
                                  "%s %s" % ( lfn, res['Value']['Failed'][lfn] ) )
            else:
              gMonitor.addMark( 'RemoveFileFail', len( lfns ) )
              errStr = "RemovalAgent.execute: Completely failed to remove files files."
              gLogger.error( errStr, res['Message'] )

          ################################################
          #  If the sub-request is a replica removal operation
          elif operation == 'replicaRemoval':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSEs = subRequestAttributes['TargetSE'].split( ',' )
            lfns = []
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                lfn = str( subRequestFile['LFN'] )
                lfns.append( lfn )
            gMonitor.addMark( 'ReplicaRemovalAtt', len( lfns ) )

            failed = {}
            errMsg = {}
            for diracSE in diracSEs:
              res = self.replicaManager.removeReplica( diracSE, lfns )
              if res['OK']:
                for lfn in res['Value']['Failed'].keys():
                  errorMessage = str( res['Value']['Failed'][lfn] )
                  if errorMessage.find( 'Write access not permitted for this credential.' ) != -1:
                    if self.__getProxyAndRemoveReplica( diracSE, lfn ):
                      continue
                  if errorMessage.find( 'seconds timeout for "__gfal_wrapper" call' ) != -1:
                    self.timeOutCounter += 1
                  if not failed.has_key( lfn ):
                    failed[lfn] = {}
                  failed[lfn][diracSE] = res['Value']['Failed'][lfn]
              else:
                errMsg[diracSE] = res['Message']
                for lfn in lfns:
                  if not failed.has_key( lfn ):
                    failed[lfn] = {}
                  failed[lfn][diracSE] = 'Completely'
            # Now analyse the results
            failedLFNs = failed.keys()
            lfnsOK = [lfn for lfn in lfns if not lfn in failedLFNs]
            gMonitor.addMark( 'ReplicaRemovalDone', len( lfnsOK ) )
            for lfn in lfnsOK:
              gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( lfn, str( diracSEs ) ) )
              res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
              if not res['OK']:
                gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
              modified = True
            if failed:
              gMonitor.addMark( 'ReplicaRemovalFail', len( failedLFNs ) )
              for lfn in failedLFNs:
                for diracSE in failed[lfn].keys():
                  if type( failed[lfn][diracSE] ) in StringTypes:
                    if re.search( 'no such file or directory', failed[lfn][diracSE].lower() ):
                      gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                      res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                      if not res['OK']:
                        gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                      modified = True
                    else:
                      gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( lfn, diracSE, failed[lfn][diracSE] ) )
            if errMsg:
              for diracSE in errMsg.keys():
                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE
                gLogger.error( errStr, errMsg[diracSE] )

          ################################################
          #  If the sub-request is a request to the online system to retransfer
          elif operation == 'reTransfer':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSE = subRequestAttributes['TargetSE']
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                pfn = str( subRequestFile['PFN'] )
                lfn = str( subRequestFile['LFN'] )
                res = self.replicaManager.onlineRetransfer( diracSE, pfn )
                if res['OK']:
                  if res['Value']['Successful'].has_key( pfn ):
                    gLogger.info( "RemovalAgent.execute: Successfully requested retransfer of %s." % pfn )
                    result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                    if not result['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                    modified = True
                  else:
                    errStr = "RemovalAgent.execute: Failed to request retransfer."
                    gLogger.error( errStr, "%s %s %s" % ( pfn, diracSE, res['Value']['Failed'][pfn] ) )
                else:
                  errStr = "RemovalAgent.execute: Completely failed to request retransfer."
                  gLogger.error( errStr, res['Message'] )
              else:
                gLogger.info( "RemovalAgent.execute: File already completed." )

          ################################################
          #  If the sub-request is none of the above types
          else:
            gLogger.error( "RemovalAgent.execute: Operation not supported.", operation )

          ################################################
          #  Determine whether there are any active files
          if oRequest.isSubRequestEmpty( ind, 'removal' )['Value']:
            oRequest.setSubRequestStatus( ind, 'removal', 'Done' )
            gMonitor.addMark( "Done", 1 )

        ################################################
        #  If the sub-request is already in terminal state
        else:
          gLogger.info( "RemovalAgent.execute:",
                        "Sub-request %s is status '%s' and not to be executed." %
                        ( ind, subRequestAttributes['Status'] ) )

      ################################################
      #  Generate the new request string after operation
      newrequestString = oRequest.toXML()['Value']
    except Exception:
      # if something fails, log it and return the original request back to the server
      gLogger.exception( "RemovalAgent.execute: Exception while processing request %s" % requestName )
      res = self.requestDBClient.updateRequest( requestName, requestString, sourceServer )
      return S_OK()

    res = self.requestDBClient.updateRequest( requestName, newrequestString, sourceServer )

    if modified and jobID:
      result = self.finalizeRequest( requestName, jobID, sourceServer )

    return S_OK()

  def __getProxyAndRemoveReplica( self, diracSE, lfn ):
    """
    get a proxy from the owner of the file and try to remove it
    returns True if it succeeds, False otherwise
    """

    result = self.replicaManager.getCatalogDirectoryMetadata( lfn, singleFile = True )
    if not result[ 'OK' ]:
      gLogger.error( "Could not get metadata info", result[ 'Message' ] )
      return False
    ownerRole = result[ 'Value' ][ 'OwnerRole' ]
    ownerDN = result[ 'Value' ][ 'OwnerDN' ]
    if ownerRole[0] != "/":
      ownerRole = "/%s" % ownerRole

    userProxy = ''
    for ownerGroup in Registry.getGroupsWithVOMSAttribute( ownerRole ):
      result = gProxyManager.downloadVOMSProxy( ownerDN, ownerGroup, limited = True,
                                                requiredVOMSAttribute = ownerRole )
      if not result[ 'OK' ]:
        gLogger.verbose ( 'Failed to retrieve voms proxy for %s : %s:' % ( ownerDN, ownerRole ),
                          result[ 'Message' ] )
        continue
      userProxy = result[ 'Value' ]
      gLogger.verbose( "Got proxy for %s@%s [%s]" % ( ownerDN, ownerGroup, ownerRole ) )
      break
    if not userProxy:
      return False

    result = userProxy.dumpAllToFile()
    if not result[ 'OK' ]:
      gLogger.verbose( result[ 'Message' ] )
      return False

    upFile = result[ 'Value' ]
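    # run the removal with the owner's credentials by pointing X509_USER_PROXY at the dumped proxy file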
    prevProxyEnv = os.environ[ 'X509_USER_PROXY' ]
    os.environ[ 'X509_USER_PROXY' ] = upFile

    try:
      res = self.replicaManager.removeReplica( diracSE, lfn )
      if res['OK'] and lfn in res[ 'Value' ]['Successful']:
        gLogger.verbose( 'Removed %s from %s' % ( lfn, diracSE ) )
        return True
    finally:
      os.environ[ 'X509_USER_PROXY' ] = prevProxyEnv
      os.unlink( upFile )

    return False

  def finalize( self ):
    """
    Called by the Agent framework to cleanly end execution.
    In this case this module will wait until all pending ThreadedJobs in the
    ThreadPool get executed
    """

    self.threadPool.processAllResults()
    return S_OK()
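
The execute/finalize pair above is the standard producer pattern for these agents: execute() keeps queueing ThreadedJobs until the pool refuses them, and finalize() drains the pool with processAllResults(). A minimal sketch of the pattern, assuming the usual DIRAC.Core.Utilities.ThreadPool import path; worker() is a hypothetical stand-in for executeRequest:

from DIRAC import S_OK
from DIRAC.Core.Utilities.ThreadPool import ThreadPool, ThreadedJob

def worker():
  # stand-in for executeRequest: fetch one request and process it
  return S_OK()

pool = ThreadPool( 1, 10, 100 )  # min threads, max threads, max queued jobs
pool.daemonize()                 # collect results as soon as jobs finish

while True:
  ret = pool.queueJob( ThreadedJob( worker ) )
  if not ret['OK']:              # queue full: stop producing for this cycle
    break

pool.processAllResults()         # block until every queued job has run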
Code example #13
class OutputDataExecutor:

  def __init__( self, csPath = "" ):
    self.log = gLogger.getSubLogger( "OutputDataExecutor" )
    if not csPath:
      vo = gConfig.getValue( "/DIRAC/VirtualOrganization", "" )
      self.__transfersCSPath = '/Operations/%s/OutputData' % vo
    else:
      self.__transfersCSPath = csPath
    self.log.verbose( "Reading transfer paths from %s" % self.__transfersCSPath )
    self.__requiredCSOptions = ['InputPath', 'InputFC', 'OutputPath', 'OutputFC', 'OutputSE']

    self.__threadPool = ThreadPool( gConfig.getValue( "%s/MinTransfers" % self.__transfersCSPath, 1 ),
                                    gConfig.getValue( "%s/MaxTransfers" % self.__transfersCSPath, 4 ),
                                    gConfig.getValue( "%s/MaxQueuedTransfers" % self.__transfersCSPath, 100 ) )
    self.__threadPool.daemonize()
    self.__processingFiles = set()
    self.__okTransferredFiles = 0
    self.__okTransferredBytes = 0
    self.__failedFiles = {}

  def getNumOKTransferredFiles( self ):
    return self.__okTransferredFiles

  def getNumOKTransferredBytes( self ):
    return self.__okTransferredBytes

  def transfersPending( self ):
    return self.__threadPool.isWorking()

  def getDefinedTransferPaths( self ):
    result = gConfig.getSections( self.__transfersCSPath )
    if not result['OK']:
      self.log.info( 'No Input/Output Pair defined in CS' )
      return S_OK( {} )  # callers iterate over 'Value', so return an empty dict

    pathList = result['Value']

    tPaths = {}
    for name in pathList:
      csPath = self.__transfersCSPath + '/%s' % name
      result = gConfig.getOptionsDict( csPath )
      if not result['OK']:
        continue
      transferDict = result['Value']
      ok = True
      for i in self.__requiredCSOptions:
        if i not in transferDict:
          self.log.error( 'Missing Option %s in %s' % ( i, csPath ) )
          ok = False
          break
      if not ok:
        continue
      tPaths[ name ] = transferDict

    return S_OK( tPaths )

  def getNumLocalOutgoingFiles( self ):
    result = self.getDefinedTransferPaths()
    if not result[ 'OK' ]:
      return 0
    localOutgoing = 0
    tPaths = result[ 'Value' ]
    for name in tPaths:
      transferDict = tPaths[ name ]
      if 'LocalDisk' != transferDict['InputFC']:
        continue
      localOutgoing += len( self.getOutgoingFiles( transferDict ) )
    return localOutgoing

  def getOutgoingFiles( self, transferDict ):
    """
    Get list of files to be processed from InputPath
    """
    inputFCName = transferDict['InputFC']
    inputPath = transferDict['InputPath']

    if inputFCName == 'LocalDisk':
      files = []
      try:
        for fileName in os.listdir( inputPath ):
          if os.path.isfile( os.path.join( inputPath, fileName ) ):
            files.append( fileName )
      except OSError:
        # the input directory may be missing or unreadable
        pass
      return files

    inputFC = FileCatalog( [inputFCName] )
    result = inputFC.listDirectory( inputPath, True )

    if not result['OK']:
      self.log.error( result['Message'] )
      return []
    if not inputPath in result['Value']['Successful']:
      self.log.error( result['Value']['Failed'][inputPath] )
      return []

    subDirs = result['Value']['Successful'][inputPath]['SubDirs']
    files = result['Value']['Successful'][inputPath]['Files']
    for subDir in subDirs:
      self.log.info( 'Ignoring subdirectory:', subDir )
    return files.keys()

  def checkForTransfers( self ):
    """
    Check for transfers to do and start them
    """
    result = self.getDefinedTransferPaths()
    if not result[ 'OK' ]:
      return result
    tPaths = result[ 'Value' ]
    for name in tPaths:
      transferPath = tPaths[ name ]
      self.log.verbose( "Checking %s transfer path" % name )
      filesToTransfer = self.getOutgoingFiles( tPaths[ name ] )
      self.log.info( "Transfer path %s has %d files" % ( name, len( filesToTransfer ) ) )
      ret = self.__addFilesToThreadPool( filesToTransfer, transferPath )
      if not ret['OK']:
        # The thread pool got full 
        break

  def processAllPendingTransfers( self ):
    self.__threadPool.processAllResults()

  @transferSync
  def __addFilesToThreadPool( self, files, transferDict ):
    for fileName in files:
      fileName = os.path.basename( fileName )
      if fileName in self.__processingFiles:
        continue
      self.__processingFiles.add( fileName )
      time.sleep( 1 )
      ret = self.__threadPool.generateJobAndQueueIt( self.__transferIfNotRegistered,
                                            args = ( fileName, transferDict ),
                                            oCallback = self.transferCallback,
                                            blocking = False )
      if not ret['OK']:
        # The thread pool got full 
        return ret
    return S_OK()

  def __transferIfNotRegistered( self, file, transferDict ):
    result = self.isRegisteredInOutputCatalog( file, transferDict )
    if not result[ 'OK' ]:
      self.log.error( result[ 'Message' ] )
      return result
    #Already registered. Need to delete
    if result[ 'Value' ]:
      self.log.info( "Transfer file %s is already registered in the output catalog" % file )
      #Delete
      filePath = os.path.join( transferDict[ 'InputPath' ], file )
      if transferDict[ 'InputFC' ] == 'LocalDisk':
        os.unlink( filePath )
      else:
        # filePath is also the LFN of the file in the input catalog
        inputFC = FileCatalog( [ transferDict['InputFC'] ] )
        replicaDict = inputFC.getReplicas( filePath )
        if not replicaDict['OK']:
          self.log.error( "Error deleting file", replicaDict['Message'] )
        elif filePath not in replicaDict['Value']['Successful']:
          self.log.error( "Error deleting file", replicaDict['Value']['Failed'][filePath] )
        else:
          seList = replicaDict['Value']['Successful'][filePath].keys()
          for se in seList:
            se = StorageElement( se )
            self.log.info( 'Removing from %s:' % se.name, filePath )
            se.removeFile( filePath )
          inputFC.removeFile( filePath )
      self.log.info( "File %s deleted from %s" % ( file, transferDict[ 'InputFC' ] ) )
      self.__processingFiles.discard( file )
      return S_OK( file )
    #Do the transfer
    return self.__retrieveAndUploadFile( file, transferDict )

  def isRegisteredInOutputCatalog( self, file, transferDict ):
    fc = FileCatalog( [ transferDict[ 'OutputFC' ] ] )
    lfn = os.path.join( transferDict['OutputPath'], os.path.basename( file ) )
    result = fc.getReplicas( lfn )
    if not result[ 'OK' ]:
      return result
    if lfn not in result[ 'Value' ][ 'Successful' ]:
      return S_OK( False )
    replicas = result[ 'Value' ][ 'Successful' ][ lfn ]
    for seName in List.fromChar( transferDict[ 'OutputSE' ], "," ):
      if seName in replicas:
        self.log.verbose( "Transfer file %s is already registered in %s SE" % ( file, seName ) )
        return S_OK( True )
    return S_OK( False )

  def __retrieveAndUploadFile( self, file, outputDict ):
    """
    Retrieve, Upload, and remove
    """
    fileName = file
    inputPath = outputDict['InputPath']
    inputFCName = outputDict['InputFC']
    inBytes = 0
    if inputFCName == 'LocalDisk':
      inFile = file
      file = os.path.join( inputPath, file )
    else:
      inputFC = FileCatalog( [inputFCName] )

      inFile = os.path.join( inputPath, file )
      replicaDict = inputFC.getReplicas( inFile )
      if not replicaDict['OK']:
        self.log.error( replicaDict['Message'] )
        return S_ERROR( fileName )
      if not inFile in replicaDict['Value']['Successful']:
        self.log.error( replicaDict['Value']['Failed'][inFile] )
        return S_ERROR( fileName )
      seList = replicaDict['Value']['Successful'][inFile].keys()

      inputSE = StorageElement( seList[0] )
      self.log.info( 'Retrieving from %s:' % inputSE.name, inFile )
      # ret = inputSE.getFile( inFile )
      # lcg_util binding prevents multithreading, use a subprocess instead
      res = pythonCall( 2 * 3600, inputSE.getFile, inFile )
      if not res['OK']:
        self.log.error( res['Message'] )
        return S_ERROR( fileName )
      ret = res['Value']
      if not ret['OK']:
        self.log.error( ret['Message'] )
        return S_ERROR( fileName )
      if not inFile in ret['Value']['Successful']:
        self.log.error( ret['Value']['Failed'][inFile] )
        return S_ERROR( fileName )

    if os.path.isfile( file ):
      inBytes = os.stat( file )[6]  # index 6 is ST_SIZE

    outputPath = outputDict['OutputPath']
    outputFCName = outputDict['OutputFC']
    replicaManager = ReplicaManager()
    outFile = os.path.join( outputPath, os.path.basename( file ) )
    transferOK = False
    for outputSEName in List.fromChar( outputDict['OutputSE'], "," ):
      outputSE = StorageElement( outputSEName )
      self.log.info( 'Trying to upload to %s:' % outputSE.name, outFile )
      # ret = replicaManager.putAndRegister( outFile, os.path.realpath( file ), outputSE.name, catalog=outputFCName )
      # lcg_util binding prevents multithreading, use a subprocess instead
      result = pythonCall( 2 * 3600, replicaManager.putAndRegister, outFile, os.path.realpath( file ), outputSE.name, catalog = outputFCName )
      if result['OK'] and result['Value']['OK']:
        if outFile in result['Value']['Value']['Successful']:
          transferOK = True
          break
        else:
          self.log.error( result['Value']['Value']['Failed'][outFile] )
      else:
        if result['OK']:
          self.log.error( result['Value']['Message'] )
        else:
          self.log.error( result['Message'] )

    if not transferOK:
      return S_ERROR( fileName )

    if result['OK'] or inputFCName != 'LocalDisk':
      os.unlink( file )

    if not result['OK']:
      self.log.error( result['Message'] )
      return S_ERROR( fileName )

    self.log.info( "Finished transferring %s [%s bytes]" % ( inFile, inBytes ) )
    self.__okTransferredFiles += 1
    self.__okTransferredBytes += inBytes

    if inputFCName == 'LocalDisk':
      return S_OK( fileName )

    # Now the file is on final SE/FC, remove from input SE/FC
    for se in seList:
      se = StorageElement( se )
      self.log.info( 'Removing from %s:' % se.name, inFile )
      se.removeFile( inFile )

    inputFC.removeFile( inFile )

    return S_OK( fileName )

  @transferSync
  def transferCallback( self, threadedJob, submitResult ):
    if not submitResult['OK']:
      fileName = submitResult['Message']
      if fileName not in self.__failedFiles:
        self.__failedFiles[fileName] = 0
      self.__failedFiles[fileName] += 1
    else:
      fileName = submitResult['Value']
      if fileName in self.__failedFiles:
        del self.__failedFiles[fileName]
    #Take out from processing files
    if fileName in self.__processingFiles:
      self.__processingFiles.discard( fileName )
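
OutputDataExecutor drives the same pool through generateJobAndQueueIt, which builds the ThreadedJob internally and attaches a completion callback. A minimal sketch of that callback pattern, under the same assumptions as above (transfer() and the file name are hypothetical):

from DIRAC import S_OK
from DIRAC.Core.Utilities.ThreadPool import ThreadPool

results = []

def transfer( fileName ):
  # stand-in for __transferIfNotRegistered: report the name back via the result
  return S_OK( fileName )        # or S_ERROR( fileName ) on failure

def done( threadedJob, submitResult ):
  # invoked by the pool once transfer() returns, cf. transferCallback above
  results.append( submitResult )

pool = ThreadPool( 1, 4, 100 )
pool.daemonize()
ret = pool.generateJobAndQueueIt( transfer,
                                  args = ( 'file.dat', ),
                                  oCallback = done,
                                  blocking = False )  # S_ERROR when the queue is full
pool.processAllResults()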
Code example #14
File: FTSAgent.py  Project: yuw726/DIRAC
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts graph refresh in seconds
  FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
  # # SE R/W access refresh in seconds
  RW_REFRESH = 600
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  STAGE_FILES = False
  # # replica manager
  __replicaManager = None
  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSGraph
  __ftsGraph = None
  # # graph regeneration time delta
  __ftsGraphValidStamp = None
  # # r/w access valid stamp
  __rwAccessValidStamp = None
  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # se cache
  __seCache = dict()
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient

  @classmethod
  def replicaManager( cls ):
    """ replica manager getter """
    if not cls.__replicaManager:
      cls.__replicaManager = ReplicaManager()
    return cls.__replicaManager

  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getSE( cls, seName ):
    """ keep SEs in cache """
    if seName not in cls.__seCache:
      cls.__seCache[seName] = StorageElement( seName )
    return cls.__seCache[seName]

  @classmethod
  def getRequest( cls, reqName ):
    """ keep Requests in cache """
    if reqName not in cls.__reqCache:
      getRequest = cls.requestClient().getRequest( reqName )
      if not getRequest["OK"]:
        return getRequest
      getRequest = getRequest["Value"]
      if not getRequest:
        return S_ERROR( "request of name '%s' not found in ReqDB" % reqName )
      cls.__reqCache[reqName] = getRequest

    return S_OK( cls.__reqCache[reqName] )

  @classmethod
  def putRequest( cls, request ):
    """ put request back to ReqDB

    :param Request request: Request instance

    also finalize request if status == Done
    """
    # # put back request
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache
    if request.RequestName in cls.__reqCache:
      del cls.__reqCache[ request.RequestName ]
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.items():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSGraph( self ):
    """ create fts graph """
    log = gLogger.getSubLogger( "ftsGraph" )

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      log.error( "unable to get FTS history: %s" % ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory )
    finally:
      self.updateLock().release()

    log.debug( "FTSSites: %s" % len( self.__ftsGraph.nodes() ) )
    for i, site in enumerate( self.__ftsGraph.nodes() ):
      log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) )
    log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) )
    for i, route in enumerate( self.__ftsGraph.edges() ):
      log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i,
                                                                             route.routeName,
                                                                             route.ActiveJobs,
                                                                             route.toNode.MaxActiveJobs ) )
    # # save graph stamp
    self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )

    # # refresh SE R/W access
    try:
      self.updateLock().acquire()
      self.__ftsGraph.updateRWAccess()
    finally:
      self.updateLock().release()
    # # save rw access stamp
    self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """

    log = self.log.getSubLogger( "initialize" )

    self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH )
    log.info( "FTSGraph validity period       = %s s" % self.FTSGRAPH_REFRESH )
    self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH )
    log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH )

    self.STAGE_FILES = self.am_getOption( "StageFiles", self.STAGE_FILES )
    log.info( "Stage files before submission  = %s" % {True: "yes", False: "no"}[bool( self.STAGE_FILES )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = %s" % self.MAX_ACTIVE_JOBS )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = %d" % self.MAX_FILES_PER_JOB )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = %s" % self.MAX_ATTEMPT )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = %s" % self.MIN_THREADS )
    log.info( "ThreadPool max threads         = %s" % self.MAX_THREADS )

    log.info( "initialize: creation of FTSGraph..." )
    createGraph = self.resetFTSGraph()
    if not createGraph["OK"]:
      log.error( "initialize: %s" % createGraph["Message"] )
      return createGraph

    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    log = self.log.getSubLogger( "finalize" )
    for request in self.__reqCache.values():
      put = self.requestClient().putRequest( request )
      if not put["OK"]:
        log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """
    log = gLogger.getSubLogger( "execute" )
    # # reset FTSGraph if expired
    now = datetime.datetime.now()
    if now > self.__ftsGraphValidStamp:
      log.info( "resetting expired FTS graph..." )
      resetFTSGraph = self.resetFTSGraph()
      if not resetFTSGraph["OK"]:
        log.error( "FTSGraph recreation error: %s" % resetFTSGraph["Message"] )
        return resetFTSGraph
      self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # update R/W access in FTSGraph if expired
    if now > self.__rwAccessValidStamp:
      log.info( "updating expired R/W access for SEs..." )
      try:
        self.updateLock().acquire()
        self.__ftsGraph.updateRWAccess()
      finally:
        self.updateLock().release()
        self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH )

    requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] )
    if not requestNames["OK"]:
      log.error( "unable to read scheduled request names: %s" % requestNames["Message"] )
      return requestNames
    if not requestNames["Value"]:
      requestNames = self.__reqCache.keys()
    else:
      requestNames = [ req[0] for req in requestNames["Value"] ]
      requestNames = list( set ( requestNames + self.__reqCache.keys() ) )

    if not requestNames:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestNames ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestNames ) - len( self.__reqCache ) ) )

    for requestName in requestNames:
      request = self.getRequest( requestName )
      if not request["OK"]:
        log.error( request["Message"] )
        continue
      request = request["Value"]
      sTJId = request.RequestName
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "request '%s' enqueued for execution" % sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( request.RequestName )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but %s" % operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got %s" % operation.Status )
      return self.putRequest( request )

    # # select  FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = ftsJobs["Value"] if ftsJobs["Value"] else []

    # # dict keeping info about files to reschedule, submit, fail and register
    ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

    if ftsJobs:
      log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) )
      # # PHASE 0 = monitor active FTSJobs
      for ftsJob in ftsJobs:
        monitor = self.__monitorJob( request, ftsJob )
        if not monitor["OK"]:
          log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
          ftsJob.Status = "Submitted"
          continue
        ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

      log.info( "monitoring of FTSJobs completed" )
      for key, ftsFiles in ftsFilesDict.items():
        if ftsFiles:
          log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

    # # PHASE ONE - check ready replicas
    missingReplicas = self.__checkReadyReplicas( request, operation )
    if not missingReplicas["OK"]:
      log.error( missingReplicas["Message"] )
    else:
      missingReplicas = missingReplicas["Value"]
      for opFile in operation:
        # Actually the condition below should never happen... Change printout for checking
        if opFile.LFN not in missingReplicas and opFile.Status != 'Done':
          log.warn( "Should be set! %s is replicated at all targets" % opFile.LFN )
          opFile.Status = "Done"

    toFail = ftsFilesDict.get( "toFail", [] )
    toReschedule = ftsFilesDict.get( "toReschedule", [] )
    toSubmit = ftsFilesDict.get( "toSubmit", [] )
    toRegister = ftsFilesDict.get( "toRegister", [] )
    toUpdate = ftsFilesDict.get( "toUpdate", [] )

    # # PHASE TWO = Failed files? -> make request Failed and return
    if toFail:
      log.error( "==> found %s 'Failed' FTSFiles, request execution cannot proceed..." % len( toFail ) )
      for opFile in operation:
        for ftsFile in toFail:
          if opFile.FileID == ftsFile.FileID:
            opFile.Error = ftsFile.Error
            opFile.Status = "Failed"
      operation.Error = "%s files are missing any replicas" % len( toFail )
      # # request.Status should already be 'Failed' at this stage
      if request.Status == "Failed":
        request.Error = "ReplicateAndRegister %s failed" % operation.Order
        log.error( "request is set to 'Failed'" )
        return self.putRequest( request )

    # # PHASE THREE - update Waiting#SourceSE FTSFiles
    if toUpdate:
      log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
      byTarget = {}
      for ftsFile in toUpdate:
        if ftsFile.TargetSE not in byTarget:
          byTarget.setdefault( ftsFile.TargetSE, [] )
        byTarget[ftsFile.TargetSE].append( ftsFile.FileID )
      for targetSE, fileIDList in byTarget.items():
        update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
        if not update["OK"]:
          log.error( "update FTSFiles failed: %s" % update["Message"] )
          continue

    # # PHASE FOUR - add 'RegisterReplica' Operations
    if toRegister:
      log.info( "==> found %s Files waiting for registration, adding 'RegisterReplica' operations" )
      registerFiles = self.__register( request, operation, toRegister )
      if not registerFiles["OK"]:
        log.error( "unable to create 'RegisterReplica' operations: %s" % registerFiles["Message"] )
      if request.Status == "Waiting":
        log.info( "request is in 'Waiting' state, will put it back to RMS" )
        return self.putRequest( request )

    # # PHASE FIVE - reschedule operation files
    if toReschedule:
      log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
      rescheduleFiles = self.__reschedule( request, operation, toReschedule )
      if not rescheduleFiles["OK"]:
        log.error( rescheduleFiles["Message"] )
      if request.Status == "Waiting":
        log.info( "request is in 'Waiting' state, will put it back to ReqDB" )
        return self.putRequest( request )

    # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs
    ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting" ] )
    if not ftsFiles["OK"]:
      log.error( ftsFiles["Message"] )
    else:
      retryIds = list( set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] ) )
      for ftsFile in ftsFiles["Value"]:
        if ftsFile.FTSFileID not in retryIds:
          toSubmit.append( ftsFile )
          retryIds.append( ftsFile.FTSFileID )

    # # submit new ftsJobs
    if operation.Status == "Scheduled" and toSubmit:
      log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
      submit = self.__submit( request, operation, toSubmit )
      if not submit["OK"]:
        log.error( submit["Message"] )
      else:
        ftsJobs += submit["Value"]

    # # status change? - put back request
    if request.Status != "Scheduled":
      put = self.putRequest( request )
      if not put["OK"]:
        log.error( "unable to put back request: %s" % put["Message"] )
        return put

    # #  put back jobs
    if ftsJobs:
      putJobs = self.putFTSJobs( ftsJobs )
      if not putJobs["OK"]:
        log.error( "unable to put back FTSJobs: %s" % putJobs["Message"] )
        return putJobs

    return S_OK()

  def __reschedule( self, request, operation, toReschedule ):
    """ reschedule list of :toReschedule: files in request for operation :operation:

    :param Request request:
    :param Operation operation:
    :param list toReschedule: list of FTSFiles
    """
    log = self.log.getSubLogger( "%s/reschedule" % request.RequestName )
    log.info( "found %s files to reschedule" % len( toReschedule ) )

    for opFile in operation:
      for ftsFile in toReschedule:
        if opFile.FileID == ftsFile.FileID:
          opFile.Status = "Waiting"

    toSchedule = []

    # # filter files
    for opFile in operation.getWaitingFilesList():

      replicas = self.__filterReplicas( opFile )
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]

      if not replicas["Valid"] and replicas["Banned"]:
        log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN )
        continue

      validReplicas = replicas["Valid"]
      bannedReplicas = replicas["Banned"]

      if not validReplicas and bannedReplicas:
        log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN )
        continue

      if validReplicas:
        validTargets = list( set( operation.targetSEList ) - set( validReplicas ) )
        if not validTargets:
          log.info( "file %s is already present at all targets" % opFile.LFN )
          opFile.Status = "Done"
          continue
        toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) )

    # # do real schedule here
    if toSchedule:

      ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID,
                                                  operation.OperationID,
                                                  toSchedule )
      if not ftsSchedule["OK"]:
        self.log.error( ftsSchedule["Message"] )
        return ftsSchedule

      ftsSchedule = ftsSchedule["Value"]
      for fileID in ftsSchedule["Successful"]:
        for opFile in operation:
          if fileID == opFile.FileID:
            opFile.Status = "Scheduled"

      for fileID, reason in ftsSchedule["Failed"]:
        for opFile in operation:
          if fileID == opFile.FileID:
            opFile.Error = reason

    return S_OK()


  def __submit( self, request, operation, toSubmit ):
    """ create and submit new FTSJobs using list of FTSFiles

    :param Request request: ReqDB.Request instance
    :param Operation operation: ReplicateAndRegister operation being executed
    :param list toSubmit: list of FTSFile instances

    :return: S_OK( [ FTSJob, FTSJob, ... ] )
    """
    log = self.log.getSubLogger( "%s/submit" % request.RequestName )

    bySourceAndTarget = {}
    for ftsFile in toSubmit:
      if ftsFile.SourceSE not in bySourceAndTarget:
        bySourceAndTarget.setdefault( ftsFile.SourceSE, {} )
      if ftsFile.TargetSE not in bySourceAndTarget[ftsFile.SourceSE]:
        bySourceAndTarget[ftsFile.SourceSE].setdefault( ftsFile.TargetSE, [] )
      bySourceAndTarget[ftsFile.SourceSE][ftsFile.TargetSE].append( ftsFile )

    ftsJobs = []

    for source, targetDict in bySourceAndTarget.items():

      for target, ftsFileList in targetDict.items():

        log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) )

        route = self.__ftsGraph.findRoute( source, target )
        if not route["OK"]:
          log.error( route["Message"] )
          continue
        route = route["Value"]

        sourceRead = route.fromNode.SEs[source]["read"]
        if not sourceRead:
          log.error( "SourceSE %s is banned for reading right now" % source )
          continue

        targetWrite = route.toNode.SEs[target]["write"]
        if not targetWrite:
          log.error( "TargetSE %s is banned for writing right now" % target )
          continue

        if route.ActiveJobs > route.toNode.MaxActiveJobs:
          log.warn( "unable to submit new FTS job, max active jobs reached" )
          continue

        # # create FTSJob
        ftsJob = FTSJob()
        ftsJob.RequestID = request.RequestID
        ftsJob.OperationID = operation.OperationID
        ftsJob.SourceSE = source
        ftsJob.TargetSE = target

        sourceSE = self.getSE( source )
        sourceToken = sourceSE.getStorageParameters( "SRM2" )
        if not sourceToken["OK"]:
          log.error( "unable to get sourceSE '%s' parameters: %s" % ( source, sourceToken["Message"] ) )
          continue
        ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" )

        targetSE = self.getSE( target )
        targetToken = targetSE.getStorageParameters( "SRM2" )
        if not targetToken["OK"]:
          log.error( "unable to get targetSE '%s' parameters: %s" % ( target, targetToken["Message"] ) )
          continue
        ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" )

        ftsJob.FTSServer = route.toNode.FTSServer

        for ftsFile in ftsFileList:
          ftsFile.Attempt += 1
          ftsFile.Error = ""
          ftsJob.addFile( ftsFile )

        submit = ftsJob.submitFTS2( self.STAGE_FILES )
        if not submit["OK"]:
          log.error( "unable to submit FTSJob: %s" % submit["Message"] )
          continue

        log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

        # # update statuses for job files ( Attempt was already incremented before submission )
        for ftsFile in ftsJob:
          ftsFile.FTSGUID = ftsJob.FTSGUID
          ftsFile.Status = "Submitted"

        # # update graph route
        try:
          self.updateLock().acquire()
          route.ActiveJobs += 1
        finally:
          self.updateLock().release()

        ftsJobs.append( ftsJob )

    log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) )
    return S_OK( ftsJobs )

  def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS2 for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "%s/monitor/%s" % ( request.RequestName, ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

    monitor = ftsJob.monitorFTS2()
    if not monitor["OK"]:
      gMonitor.addMark( "FTSMonitorFail", 1 )
      log.error( monitor["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"]:
        log.error( "FTSJob not known (expired on server?)" )
        for ftsFile in ftsJob:
          ftsFile.Status = "Waiting"
          ftsFilesDict["toSubmit"] = ftsFile
        return S_OK( ftsFilesDict )
      return monitor

    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s" % ( ftsJob.Status, ftsJob.Completeness ) )

    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )

    if ftsJob.Status in FTSJob.FINALSTATES:
      finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
      if not finalizeFTSJob["OK"]:
        log.error( finalizeFTSJob["Message"] )
        return finalizeFTSJob
      ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )

    return S_OK( ftsFilesDict )

  def __finalizeFTSJob( self, request, ftsJob ):
    """ finalize FTSJob

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "%s/monitor/%s/finalize" % ( request.RequestName, ftsJob.FTSJobID ) )
    log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

    monitor = ftsJob.monitorFTS2( full = True )
    if not monitor["OK"]:
      log.error( monitor["Message"] )
      return monitor

    # # split FTSFiles to different categories
    processFiles = self.__filterFiles( ftsJob )
    if not processFiles["OK"]:
      log.error( processFiles["Message"] )
      return processFiles
    ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles["Value"] )

    # # send accounting record for this job
    self.__sendAccounting( ftsJob, request.OwnerDN )

    # # update graph - remove this job from graph
    route = self.__ftsGraph.findRoute( ftsJob.SourceSE, ftsJob.TargetSE )
    if route["OK"]:
      try:
        self.updateLock().acquire()
        route["Value"].ActiveJobs -= 1
      finally:
        self.updateLock().release()

    log.info( "FTSJob is finalized" )

    return S_OK( ftsFilesDict )

  def __filterFiles( self, ftsJob ):
    """ process ftsFiles from finished ftsJob

    :param FTSJob ftsJob: monitored FTSJob instance
    """
    # # lists for different categories
    toUpdate = []
    toReschedule = []
    toRegister = []
    toSubmit = []
    toFail = []

    # # loop over files in fts job
    for ftsFile in ftsJob:
      # # successful files
      if ftsFile.Status == "Finished":
        if ftsFile.Error == "AddCatalogReplicaFailed":
          toRegister.append( ftsFile )
        toUpdate.append( ftsFile )
        continue
      if ftsFile.Status == "Failed":
        if ftsFile.Error == "MissingSource":
          toReschedule.append( ftsFile )
        else:
          if ftsFile.Attempt < self.MAX_ATTEMPT:
            toSubmit.append( ftsFile )
          else:
            toFail.append( ftsFile )
            ftsFile.Error = "Max attempts reached"

    return S_OK( { "toUpdate": toUpdate,
                   "toSubmit": toSubmit,
                   "toRegister": toRegister,
                   "toReschedule": toReschedule,
                   "toFail": toFail } )

  def __register( self, request, operation, toRegister ):
    """ add RegisterReplica operation

    :param Request request: request instance
    :param Operation operation: 'ReplicateAndRegister' operation for this FTSJob
    :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
    """
    log = self.log.getSubLogger( "%s/registerFiles" % request.RequestName )

    byTarget = {}
    for ftsFile in toRegister:
      if ftsFile.TargetSE not in byTarget:
        byTarget.setdefault( ftsFile.TargetSE, [] )
      byTarget[ftsFile.TargetSE].append( ftsFile )
    log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) )

    for target, ftsFileList in byTarget.items():
      log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target,
                                                                                            len( ftsFileList ) ) )
      registerOperation = Operation()
      registerOperation.Type = "RegisterReplica"
      registerOperation.Status = "Waiting"
      registerOperation.TargetSE = target
      targetSE = self.getSE( target )
      for ftsFile in ftsFileList:
        opFile = File()
        opFile.LFN = ftsFile.LFN
        pfn = targetSE.getPfnForProtocol( ftsFile.TargetSURL, "SRM2", withPort = False )
        if not pfn["OK"]:
          continue
        opFile.PFN = pfn["Value"]
        registerOperation.addFile( opFile )
      request.insertBefore( registerOperation, operation )

    return S_OK()

  @staticmethod
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send DataOperation to AccouringDB """

    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )

    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"

    username = getUsernameForDN( ownerDN )
    if not username["OK"]:
      username = ownerDN
    else:
      username = username["Value"]

    accountingDict["User"] = username
    accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
    accountingDict['ExecutionSite'] = ftsJob.FTSServer

    accountingDict['RegistrationTime'] = ftsJob._regTime
    accountingDict['RegistrationOK'] = ftsJob._regSuccess
    accountingDict['RegistrationTotal'] = ftsJob._regTotal

    accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE

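    # # wall-clock time of the whole job, in seconds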
    dt = ftsJob.LastUpdate - ftsJob.SubmitTime
    transferTime = dt.days * 86400 + dt.seconds
    accountingDict["TransferTime"] = transferTime
    # accountingDict['TransferTime'] = sum( [f._duration for f in ftsJob])
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()

  def __checkReadyReplicas( self, request, operation ):
    """ check ready replicas for transferOperation """
    log = self.log.getSubLogger( "%s/checkReadyReplicas" % request.RequestName )

    targetSESet = set( operation.targetSEList )

    # # { LFN: [ targetSE, ... ] }
    missingReplicas = {}

    scheduledFiles = dict( [ ( opFile.LFN, opFile ) for opFile in operation
                              if opFile.Status in ( "Scheduled", "Waiting" ) ] )
    # # get replicas
    replicas = self.replicaManager().getCatalogReplicas( scheduledFiles.keys() )

    if not replicas["OK"]:
      self.log.error( replicas["Message"] )
      return replicas
    replicas = replicas["Value"]

    fullyReplicated = 0
    missingSEs = {}
    for successfulLFN in replicas["Successful"]:
      reps = set( replicas['Successful'][successfulLFN] )
      if targetSESet.issubset( reps ):
        log.info( "%s has been replicated to all targets" % successfulLFN )
        fullyReplicated += 1
        scheduledFiles[successfulLFN].Status = "Done"
      else:
        missingReplicas[successfulLFN] = sorted( targetSESet - reps )
        ses = ",".join( missingReplicas[ successfulLFN ] )
        missingSEs[ses] = missingSEs.setdefault( ses, 0 ) + 1
        log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) )
    if fullyReplicated:
      log.info( "%d new files have been replicated to all targets" % fullyReplicated )
    if missingSEs:
      for ses in missingSEs:
        log.info( "%d replicas still missing at %s" % ( missingSEs[ses], ses ) )

    reMissing = re.compile( "no such file or directory" )
    for failedLFN, errStr in replicas["Failed"].items():
      scheduledFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        log.error( "%s is missing, setting its status to 'Failed'" % failedLFN )
        scheduledFiles[failedLFN].Status = "Failed"
      else:
        log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) )

    return S_OK( missingReplicas )

  def __filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs """
    log = self.log.getSubLogger( "filterReplicas" )

    ret = { "Valid" : [], "Banned" : [], "Bad" : [] }

    replicas = self.replicaManager().getActiveReplicas( opFile.LFN )
    if not replicas["OK"]:
      log.error( replicas["Message"] )
    reNotExists = re.compile( "not such file or directory" )
    replicas = replicas["Value"]
    failed = replicas["Failed"].get( opFile.LFN , "" )
    if reNotExists.match( failed.lower() ):
      opFile.Status = "Failed"
      opFile.Error = failed
      return S_ERROR( failed )

    replicas = replicas["Successful"][opFile.LFN] if opFile.LFN in replicas["Successful"] else {}

    for repSEName in replicas:

      repSE = self.getSE( repSEName )

      pfn = repSE.getPfnForLfn( opFile.LFN )
      if not pfn["OK"]:
        log.warn( "unable to create pfn for %s lfn: %s" % ( opFile.LFN, pfn["Message"] ) )
        ret["Banned"].append( repSEName )
        continue
      pfn = pfn["Value"]

      repSEMetadata = repSE.getFileMetadata( pfn, singleFile = True )
      if not repSEMetadata["OK"]:
        self.log.warn( repSEMetadata["Message"] )
        ret["Banned"].append( repSEName )
        continue
      repSEMetadata = repSEMetadata["Value"]

      seChecksum = repSEMetadata["Checksum"].replace( "x", "0" ).zfill( 8 ) if "Checksum" in repSEMetadata else None
      if opFile.Checksum and opFile.Checksum != seChecksum:
        self.log.warn( " %s checksum mismatch: %s %s:%s" % ( opFile.LFN,
                                                             opFile.Checksum,
                                                             repSE,
                                                             seChecksum ) )
        ret["Bad"].append( repSEName )
        continue
      # # if we're here repSE is OK
      ret["Valid"].append( repSEName )

    return S_OK( ret )
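The triage in __filterFiles above encodes the agent's whole retry policy in a single pass: finished transfers are queued for a status update (plus a RegisterReplica operation if only the catalog step failed), files with a missing source are rescheduled, and everything else is resubmitted until MAX_ATTEMPT is exhausted. The following minimal sketch restates that policy on its own; the bare attribute-carrying file objects it expects are illustrative stand-ins for FTSDB.FTSFile, not the real API:

MAX_ATTEMPT = 256

def triage( ftsFiles ):
  """ split the files of a finished FTS job into follow-up buckets (sketch) """
  buckets = dict( ( k, [] ) for k in ( "toUpdate", "toRegister", "toReschedule", "toSubmit", "toFail" ) )
  for ftsFile in ftsFiles:
    if ftsFile.Status == "Finished":
      if ftsFile.Error == "AddCatalogReplicaFailed":
        buckets["toRegister"].append( ftsFile )    # transfer OK, catalog entry still missing
      buckets["toUpdate"].append( ftsFile )
    elif ftsFile.Status == "Failed":
      if ftsFile.Error == "MissingSource":
        buckets["toReschedule"].append( ftsFile )  # pick another source replica
      elif ftsFile.Attempt < MAX_ATTEMPT:
        buckets["toSubmit"].append( ftsFile )      # retry in a new FTS job
      else:
        ftsFile.Error = "Max attempts reached"
        buckets["toFail"].append( ftsFile )
  return buckets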
Code example #15
0
File: RemovalAgent.py Project: KrzysztofCiba/DIRAC
class RemovalAgent( AgentModule, RequestAgentMixIn ):
  """
    This Agent takes care of executing "removal" requests from the RequestManagement system
  """

  def __init__( self, *args ):
    """
    Initialize the base class and define some extra data members
    """
    AgentModule.__init__( self, *args )
    self.requestDBClient = None
    self.replicaManager = None
    self.maxNumberOfThreads = 4
    self.maxRequestsInQueue = 100
    self.threadPool = None

  def initialize( self ):
    """
      Called by the framework upon startup, before any cycle (execute method below)
    """
    self.requestDBClient = RequestClient()
    self.replicaManager = ReplicaManager()

    gMonitor.registerActivity( "Iteration", "Agent Loops", "RemovalAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Execute", "Request Processed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "PhysicalRemovalAtt", "Physical removals attempted",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalDone", "Successful physical removals",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalFail", "Failed physical removals",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalSize", "Physically removed size",
                               "RemovalAgent", "Bytes", gMonitor.OP_ACUM )

    gMonitor.registerActivity( "ReplicaRemovalAtt", "Replica removal attempted",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalDone", "Successful replica removals",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalFail", "Failed replica removals",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "RemoveFileAtt", "File removal attempted",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileDone", "File removal done",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileFail", "File removal failed",
                               "RemovalAgent", "Removal/min", gMonitor.OP_SUM )

    self.maxNumberOfThreads = self.am_getOption( 'NumberOfThreads', self.maxNumberOfThreads )
    self.maxRequestsInQueue = self.am_getOption( 'RequestsInQueue', self.maxRequestsInQueue )
    self.threadPool = ThreadPool( 1, self.maxNumberOfThreads, self.maxRequestsInQueue )

    # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted
    self.threadPool.daemonize()

    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):
    """
    Fill the ThreadPool with ThreadedJobs
    """

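    # queueJob fails once the pool's queue is full, which ends this cycle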
    while True:
      requestExecutor = ThreadedJob( self.executeRequest )
      ret = self.threadPool.queueJob( requestExecutor )
      if not ret['OK']:
        break

    return S_OK()

  def executeRequest( self ):
    """
    Do the actual work in the Thread
    """
    ################################################
    # Get a request from request DB
    gMonitor.addMark( "Iteration", 1 )
    res = self.requestDBClient.getRequest( 'removal' )
    if not res['OK']:
      gLogger.info( "RemovalAgent.execute: Failed to get request from database." )
      return S_OK()
    elif not res['Value']:
      gLogger.info( "RemovalAgent.execute: No requests to be executed found." )
      return S_OK()
    requestString = res['Value']['RequestString']
    requestName = res['Value']['RequestName']
    sourceServer = res['Value']['Server']
    try:
      jobID = int( res['Value']['JobID'] )
    except ValueError:
      jobID = 0
    gLogger.info( "RemovalAgent.execute: Obtained request %s" % requestName )

    result = self.requestDBClient.getCurrentExecutionOrder( requestName, sourceServer )
    if result['OK']:
      currentOrder = result['Value']
    else:
      gLogger.error( 'Can not get the request execution order' )
      return S_OK( 'Can not get the request execution order' )

    oRequest = RequestContainer( request = requestString )

    ################################################
    # Find the number of sub-requests from the request
    res = oRequest.getNumSubRequests( 'removal' )
    if not res['OK']:
      errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests."
      gLogger.error( errStr, res['Message'] )
      return S_OK()
    gLogger.info( "RemovalAgent.execute: Found %s sub requests." % res['Value'] )

    ################################################
    # For all the sub-requests in the request
    modified = False
    for ind in range( res['Value'] ):
      gMonitor.addMark( "Execute", 1 )
      gLogger.info( "RemovalAgent.execute: Processing sub-request %s." % ind )
      subRequestAttributes = oRequest.getSubRequestAttributes( ind, 'removal' )['Value']
      subExecutionOrder = int( subRequestAttributes['ExecutionOrder'] )
      subStatus = subRequestAttributes['Status']
      if subStatus == 'Waiting' and subExecutionOrder <= currentOrder:
        subRequestFiles = oRequest.getSubRequestFiles( ind, 'removal' )['Value']
        operation = subRequestAttributes['Operation']

        ################################################
        #  If the sub-request is a physical removal operation
        if operation == 'physicalRemoval':
          gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
          diracSEs = subRequestAttributes['TargetSE'].split( ',' )
          physicalFiles = []
          pfnToLfn = {}
          for subRequestFile in subRequestFiles:
            if subRequestFile['Status'] == 'Waiting':
              pfn = str( subRequestFile['PFN'] )
              lfn = str( subRequestFile['LFN'] )
              pfnToLfn[pfn] = lfn
              physicalFiles.append( pfn )
          gMonitor.addMark( 'PhysicalRemovalAtt', len( physicalFiles ) )
          failed = {}
          errMsg = {}
          for diracSE in diracSEs:
            res = self.replicaManager.removeStorageFile( physicalFiles, diracSE )
            if res['OK']:
              for pfn in res['Value']['Failed'].keys():
                if not failed.has_key( pfn ):
                  failed[pfn] = {}
                failed[pfn][diracSE] = res['Value']['Failed'][pfn]
            else:
              errMsg[diracSE] = res['Message']
              for pfn in physicalFiles:
                if not failed.has_key( pfn ):
                  failed[pfn] = {}
                failed[pfn][diracSE] = 'Completely'
          # Now analyse the results
          failedPFNs = failed.keys()
          pfnsOK = [pfn for pfn in physicalFiles if not pfn in failedPFNs]
          gMonitor.addMark( 'PhysicalRemovalDone', len( pfnsOK ) )
          for pfn in pfnsOK:
            gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( pfn, str( diracSEs ) ) )
            res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
            if not res['OK']:
              gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
            modified = True
          if failed:
            gMonitor.addMark( 'PhysicalRemovalFail', len( failedPFNs ) )
            for pfn in failedPFNs:
              for diracSE in failed[pfn].keys():
                if type( failed[pfn][diracSE] ) in StringTypes:
                  if re.search( 'no such file or directory', failed[pfn][diracSE].lower() ):
                    gLogger.info( "RemovalAgent.execute: File did not exist.", pfn )
                    res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
                    if not res['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
                    modified = True
                  else:
                    gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( pfn, diracSE, failed[pfn][diracSE] ) )
          if errMsg:
            for diracSE in errMsg.keys():
              errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE
              gLogger.error( errStr, errMsg[diracSE] )


        ################################################
        #  If the sub-request is a file removal operation
        elif operation == 'removeFile':
          gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
          lfns = []
          for subRequestFile in subRequestFiles:
            if subRequestFile['Status'] == 'Waiting':
              lfn = str( subRequestFile['LFN'] )
              lfns.append( lfn )
          gMonitor.addMark( 'RemoveFileAtt', len( lfns ) )
          res = self.replicaManager.removeFile( lfns )
          if res['OK']:
            gMonitor.addMark( 'RemoveFileDone', len( res['Value']['Successful'].keys() ) )
            for lfn in res['Value']['Successful'].keys():
              gLogger.info( "RemovalAgent.execute: Successfully removed %s." % lfn )
              result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
              if not result['OK']:
                gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
              modified = True
            gMonitor.addMark( 'RemoveFileFail', len( res['Value']['Failed'].keys() ) )
            for lfn in res['Value']['Failed'].keys():
              if type( res['Value']['Failed'][lfn] ) in StringTypes:
                if re.search( 'no such file or directory', res['Value']['Failed'][lfn].lower() ):
                  gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                  result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                  if not result['OK']:
                    gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                  modified = True
                else:
                  gLogger.info( "RemovalAgent.execute: Failed to remove file:",
                                "%s %s" % ( lfn, res['Value']['Failed'][lfn] ) )
          else:
            gMonitor.addMark( 'RemoveFileFail', len( lfns ) )
            errStr = "RemovalAgent.execute: Completely failed to remove files files."
            gLogger.error( errStr, res['Message'] )

        ################################################
        #  If the sub-request is a replica removal operation
        elif operation == 'replicaRemoval':
          gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
          diracSEs = subRequestAttributes['TargetSE'].split( ',' )
          lfns = []
          for subRequestFile in subRequestFiles:
            if subRequestFile['Status'] == 'Waiting':
              lfn = str( subRequestFile['LFN'] )
              lfns.append( lfn )
          gMonitor.addMark( 'ReplicaRemovalAtt', len( lfns ) )

          failed = {}
          errMsg = {}
          for diracSE in diracSEs:
            res = self.replicaManager.removeReplica( diracSE, lfns )
            if res['OK']:
              for lfn in res['Value']['Failed'].keys():
                if not failed.has_key( lfn ):
                  failed[lfn] = {}
                failed[lfn][diracSE] = res['Value']['Failed'][lfn]
            else:
              errMsg[diracSE] = res['Message']
              for lfn in lfns:
                if not failed.has_key( lfn ):
                  failed[lfn] = {}
                failed[lfn][diracSE] = 'Completely'
          # Now analyse the results
          failedLFNs = failed.keys()
          lfnsOK = [lfn for lfn in lfns if not lfn in failedLFNs]
          gMonitor.addMark( 'ReplicaRemovalDone', len( lfnsOK ) )
          for lfn in lfnsOK:
            gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( lfn, str( diracSEs ) ) )
            res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
            if not res['OK']:
              gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
            modified = True
          if failed:
            gMonitor.addMark( 'ReplicaRemovalFail', len( failedLFNs ) )
            for lfn in failedLFNs:
              for diracSE in failed[lfn].keys():
                if type( failed[lfn][diracSE] ) in StringTypes:
                  if re.search( 'no such file or directory', failed[lfn][diracSE].lower() ):
                    gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                    res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                    if not res['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                    modified = True
                  else:
                    gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( lfn, diracSE, failed[lfn][diracSE] ) )
          if errMsg:
            for diracSE in errMsg.keys():
              errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE
              gLogger.error( errStr, errMsg[diracSE] )

        ################################################
        #  If the sub-request is a request to the online system to retransfer
        elif operation == 'reTransfer':
          gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
          diracSE = subRequestAttributes['TargetSE']
          for subRequestFile in subRequestFiles:
            if subRequestFile['Status'] == 'Waiting':
              pfn = str( subRequestFile['PFN'] )
              lfn = str( subRequestFile['LFN'] )
              res = self.replicaManager.onlineRetransfer( diracSE, pfn )
              if res['OK']:
                if res['Value']['Successful'].has_key( pfn ):
                  gLogger.info( "RemovalAgent.execute: Successfully requested retransfer of %s." % pfn )
                  result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                  if not result['OK']:
                    gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                  modified = True
                else:
                  errStr = "RemovalAgent.execute: Failed to request retransfer."
                  gLogger.error( errStr, "%s %s %s" % ( pfn, diracSE, res['Value']['Failed'][pfn] ) )
              else:
                errStr = "RemovalAgent.execute: Completely failed to request retransfer."
                gLogger.error( errStr, res['Message'] )
            else:
              gLogger.info( "RemovalAgent.execute: File already completed." )

        ################################################
        #  If the sub-request is none of the above types
        else:
          gLogger.error( "RemovalAgent.execute: Operation not supported.", operation )

        ################################################
        #  Determine whether there are any active files
        if oRequest.isSubRequestEmpty( ind, 'removal' )['Value']:
          oRequest.setSubRequestStatus( ind, 'removal', 'Done' )
          gMonitor.addMark( "Done", 1 )

      ################################################
      #  If the sub-request is already in terminal state
      else:
        gLogger.info( "RemovalAgent.execute:",
                      "Sub-request %s is status '%s' and not to be executed." %
                      ( ind, subRequestAttributes['Status'] ) )

    ################################################
    #  Generate the new request string after operation
    requestString = oRequest.toXML()['Value']
    res = self.requestDBClient.updateRequest( requestName, requestString, sourceServer )

    if modified and jobID:
      result = self.finalizeRequest( requestName, jobID, sourceServer )

    return S_OK()

  def finalize( self ):
    """
    Called by the Agent framework to cleanly end execution.
    In this case this module will wait until all pending ThreadedJobs in the
    ThreadPool get executed
    """

    self.threadPool.processAllResults()
    return S_OK()
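The execute/executeRequest pair above is the producer/consumer idiom shared by these agents: execute keeps queueing ThreadedJobs until the daemonized pool refuses new work, while the pool threads drain the queue concurrently. Below is a minimal sketch of the idiom outside an agent, reusing the pool sizes from initialize(); the import path matches the DIRAC releases these examples come from, and the work function is a placeholder:

from DIRAC.Core.Utilities.ThreadPool import ThreadPool, ThreadedJob

def doOneRequest():
  # stand-in for executeRequest-style work
  pass

pool = ThreadPool( 1, 4, 100 )  # min threads, max threads, queue depth
pool.daemonize()                # run queued jobs as soon as they arrive

while True:
  ret = pool.queueJob( ThreadedJob( doOneRequest ) )
  if not ret['OK']:             # queue full: stop producing for this cycle
    break

pool.processAllResults()        # at shutdown, wait for pending jobs to finish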
Code example #16
0
File: RemovalAgent.py Project: zhangxiaomei/DIRAC
class RemovalAgent(AgentModule, RequestAgentMixIn):
    """
    This Agent takes care of executing "removal" requests from the RequestManagement system
    """
    def __init__(self, *args):
        """
    Initialize the base class and define some extra data members
    """
        AgentModule.__init__(self, *args)
        self.requestDBClient = None
        self.replicaManager = None
        self.maxNumberOfThreads = 4
        self.maxRequestsInQueue = 100
        self.threadPool = None
        self.timeOutCounter = 0
        self.pendingRequests = True

    def initialize(self):
        """
      Called by the framework upon startup, before any cycle (execute method below)
    """
        self.requestDBClient = RequestClient()
        # the RequestAgentMixIn needs the capitalized version; until it is fixed, keep this.
        self.RequestDBClient = self.requestDBClient
        self.replicaManager = ReplicaManager()

        gMonitor.registerActivity("Iteration", "Agent Loops", "RemovalAgent",
                                  "Loops/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("Execute", "Request Processed",
                                  "RemovalAgent", "Requests/min",
                                  gMonitor.OP_SUM)
        gMonitor.registerActivity("Done", "Request Completed", "RemovalAgent",
                                  "Requests/min", gMonitor.OP_SUM)

        gMonitor.registerActivity("PhysicalRemovalAtt",
                                  "Physical removals attempted",
                                  "RemovalAgent", "Removal/min",
                                  gMonitor.OP_SUM)
        gMonitor.registerActivity("PhysicalRemovalDone",
                                  "Successful physical removals",
                                  "RemovalAgent", "Removal/min",
                                  gMonitor.OP_SUM)
        gMonitor.registerActivity("PhysicalRemovalFail",
                                  "Failed physical removals", "RemovalAgent",
                                  "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("PhysicalRemovalSize",
                                  "Physically removed size", "RemovalAgent",
                                  "Bytes", gMonitor.OP_ACUM)

        gMonitor.registerActivity("ReplicaRemovalAtt",
                                  "Replica removal attempted", "RemovalAgent",
                                  "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("ReplicaRemovalDone",
                                  "Successful replica removals",
                                  "RemovalAgent", "Removal/min",
                                  gMonitor.OP_SUM)
        gMonitor.registerActivity("ReplicaRemovalFail",
                                  "Failed replica removals", "RemovalAgent",
                                  "Removal/min", gMonitor.OP_SUM)

        gMonitor.registerActivity("RemoveFileAtt", "File removal attempted",
                                  "RemovalAgent", "Removal/min",
                                  gMonitor.OP_SUM)
        gMonitor.registerActivity("RemoveFileDone", "File removal done",
                                  "RemovalAgent", "Removal/min",
                                  gMonitor.OP_SUM)
        gMonitor.registerActivity("RemoveFileFail", "File removal failed",
                                  "RemovalAgent", "Removal/min",
                                  gMonitor.OP_SUM)

        self.maxNumberOfThreads = self.am_getOption('NumberOfThreads',
                                                    self.maxNumberOfThreads)
        self.maxRequestsInQueue = self.am_getOption('RequestsInQueue',
                                                    self.maxRequestsInQueue)
        self.threadPool = ThreadPool(1, self.maxNumberOfThreads,
                                     self.maxRequestsInQueue)

        # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted
        self.threadPool.daemonize()

        self.maxRequests = self.am_getOption('MaxRequestsPerCycle', 1200.)

        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        return S_OK()

    def execute(self):
        """
    Fill the ThreadPool with ThreadedJobs
    """
        self.pendingRequests = True
        self.maxRequests = min(
            10000., self.am_getOption('MaxRequestsPerCycle', self.maxRequests))
        requestCounter = 0
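        # queue at most maxRequests jobs per cycle, pacing the producer loop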
        while self.pendingRequests:
            if requestCounter > self.maxRequests:
                break
            requestCounter += 1
            requestExecutor = ThreadedJob(self.executeRequest)
            ret = self.threadPool.queueJob(requestExecutor)
            if not ret['OK']:
                break
            time.sleep(0.1)

        if self.timeOutCounter:
            gLogger.error('Timeouts during removal execution:',
                          self.timeOutCounter)

        return S_OK()

    def executeRequest(self):
        """
    Do the actual work in the Thread
    """
        ################################################
        # Get a request from request DB
        gMonitor.addMark("Iteration", 1)
        res = self.requestDBClient.getRequest('removal')
        if not res['OK']:
            gLogger.info(
                "RemovalAgent.execute: Failed to get request from database.")
            return S_OK()
        elif not res['Value']:
            gLogger.info(
                "RemovalAgent.execute: No requests to be executed found.")
            self.pendingRequests = False
            return S_OK()
        requestString = res['Value']['RequestString']
        requestName = res['Value']['RequestName']
        sourceServer = res['Value']['Server']

        jobID = 0
        try:
            jobID = int(res['Value']['JobID'])
        except:
            gLogger.warn(
                "RemovalAgent.execute: JobID not present or malformed in request '%s', will use 0 instead."
                % requestName)

        gLogger.info("RemovalAgent.execute: Obtained request %s" % requestName)

        try:

            result = self.requestDBClient.getCurrentExecutionOrder(
                requestName, sourceServer)
            if result['OK']:
                currentOrder = result['Value']
            else:
                gLogger.error('Can not get the request execution order')
                self.requestDBClient.updateRequest(requestName, requestString,
                                                   sourceServer)
                return S_OK('Can not get the request execution order')

            oRequest = RequestContainer(request=requestString)

            ################################################
            # Find the number of sub-requests from the request
            res = oRequest.getNumSubRequests('removal')
            if not res['OK']:
                errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests."
                gLogger.error(errStr, res['Message'])
                return S_OK()
            gLogger.info("RemovalAgent.execute: Found %s sub requests." %
                         res['Value'])

            ################################################
            # For all the sub-requests in the request
            modified = False
            for ind in range(res['Value']):
                gMonitor.addMark("Execute", 1)
                gLogger.info(
                    "RemovalAgent.execute: Processing sub-request %s." % ind)
                subRequestAttributes = oRequest.getSubRequestAttributes(
                    ind, 'removal')['Value']
                subExecutionOrder = int(subRequestAttributes['ExecutionOrder'])
                subStatus = subRequestAttributes['Status']
                if subStatus == 'Waiting' and subExecutionOrder <= currentOrder:
                    subRequestFiles = oRequest.getSubRequestFiles(
                        ind, 'removal')['Value']
                    operation = subRequestAttributes['Operation']

                    ################################################
                    #  If the sub-request is a physical removal operation
                    if operation == 'physicalRemoval':
                        gLogger.info(
                            "RemovalAgent.execute: Attempting to execute %s sub-request."
                            % operation)
                        diracSEs = subRequestAttributes['TargetSE'].split(',')
                        physicalFiles = []
                        pfnToLfn = {}
                        for subRequestFile in subRequestFiles:
                            if subRequestFile['Status'] == 'Waiting':
                                pfn = str(subRequestFile['PFN'])
                                lfn = str(subRequestFile['LFN'])
                                pfnToLfn[pfn] = lfn
                                physicalFiles.append(pfn)
                        gMonitor.addMark('PhysicalRemovalAtt',
                                         len(physicalFiles))
                        failed = {}
                        errMsg = {}
                        for diracSE in diracSEs:
                            res = self.replicaManager.removeStorageFile(
                                physicalFiles, diracSE)
                            if res['OK']:
                                for pfn in res['Value']['Failed'].keys():
                                    if not failed.has_key(pfn):
                                        failed[pfn] = {}
                                    failed[pfn][diracSE] = res['Value'][
                                        'Failed'][pfn]
                            else:
                                errMsg[diracSE] = res['Message']
                                for pfn in physicalFiles:
                                    if not failed.has_key(pfn):
                                        failed[pfn] = {}
                                    failed[pfn][diracSE] = 'Completely'
                        # Now analyse the results
                        failedPFNs = failed.keys()
                        pfnsOK = [
                            pfn for pfn in physicalFiles
                            if not pfn in failedPFNs
                        ]
                        gMonitor.addMark('PhysicalRemovalDone', len(pfnsOK))
                        for pfn in pfnsOK:
                            gLogger.info(
                                "RemovalAgent.execute: Successfully removed %s at %s"
                                % (pfn, str(diracSEs)))
                            res = oRequest.setSubRequestFileAttributeValue(
                                ind, 'removal', pfnToLfn[pfn], 'Status',
                                'Done')
                            if not res['OK']:
                                gLogger.error(
                                    "RemovalAgent.execute: Error setting status to %s for %s"
                                    % ('Done', pfnToLfn[pfn]))
                            modified = True
                        if failed:
                            gMonitor.addMark('PhysicalRemovalFail',
                                             len(failedPFNs))
                            for pfn in failedPFNs:
                                for diracSE in failed[pfn].keys():
                                    if type(failed[pfn]
                                            [diracSE]) in StringTypes:
                                        if re.search(
                                                'no such file or directory',
                                                failed[pfn][diracSE].lower()):
                                            gLogger.info(
                                                "RemovalAgent.execute: File did not exist.",
                                                pfn)
                                            res = oRequest.setSubRequestFileAttributeValue(
                                                ind, 'removal', pfnToLfn[pfn],
                                                'Status', 'Done')
                                            if not res['OK']:
                                                gLogger.error(
                                                    "RemovalAgent.execute: Error setting status to %s for %s"
                                                    % ('Done', pfnToLfn[pfn]))
                                            modified = True
                                        else:
                                            gLogger.info(
                                                "RemovalAgent.execute: Failed to remove file.",
                                                "%s at %s - %s" %
                                                (pfn, diracSE,
                                                 failed[pfn][diracSE]))
                        if errMsg:
                            for diracSE in errMsg.keys():
                                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE
                                gLogger.error(errStr, errMsg[diracSE])

                    ################################################
                    #  If the sub-request is a file removal operation
                    elif operation == 'removeFile':
                        gLogger.info(
                            "RemovalAgent.execute: Attempting to execute %s sub-request."
                            % operation)
                        lfns = []
                        for subRequestFile in subRequestFiles:
                            if subRequestFile['Status'] == 'Waiting':
                                lfn = str(subRequestFile['LFN'])
                                lfns.append(lfn)
                        gMonitor.addMark('RemoveFileAtt', len(lfns))
                        res = self.replicaManager.removeFile(lfns)
                        if res['OK']:
                            gMonitor.addMark(
                                'RemoveFileDone',
                                len(res['Value']['Successful'].keys()))
                            for lfn in res['Value']['Successful'].keys():
                                gLogger.info(
                                    "RemovalAgent.execute: Successfully removed %s."
                                    % lfn)
                                result = oRequest.setSubRequestFileAttributeValue(
                                    ind, 'removal', lfn, 'Status', 'Done')
                                if not result['OK']:
                                    gLogger.error(
                                        "RemovalAgent.execute: Error setting status to %s for %s"
                                        % ('Done', lfn))
                                modified = True
                            gMonitor.addMark(
                                'RemoveFileFail',
                                len(res['Value']['Failed'].keys()))
                            for lfn in res['Value']['Failed'].keys():
                                if type(res['Value']['Failed']
                                        [lfn]) in StringTypes:
                                    if re.search(
                                            'no such file or directory',
                                            res['Value']['Failed']
                                        [lfn].lower()):
                                        gLogger.info(
                                            "RemovalAgent.execute: File did not exist.",
                                            lfn)
                                        result = oRequest.setSubRequestFileAttributeValue(
                                            ind, 'removal', lfn, 'Status',
                                            'Done')
                                        if not result['OK']:
                                            gLogger.error(
                                                "RemovalAgent.execute: Error setting status to %s for %s"
                                                % ('Done', lfn))
                                        modified = True
                                    else:
                                        gLogger.info(
                                            "RemovalAgent.execute: Failed to remove file:",
                                            "%s %s" %
                                            (lfn, res['Value']['Failed'][lfn]))
                        else:
                            gMonitor.addMark('RemoveFileFail', len(lfns))
                            errStr = "RemovalAgent.execute: Completely failed to remove files files."
                            gLogger.error(errStr, res['Message'])

                    ################################################
                    #  If the sub-request is a replica removal operation
                    elif operation == 'replicaRemoval':
                        gLogger.info(
                            "RemovalAgent.execute: Attempting to execute %s sub-request."
                            % operation)
                        diracSEs = subRequestAttributes['TargetSE'].split(',')
                        lfns = []
                        for subRequestFile in subRequestFiles:
                            if subRequestFile['Status'] == 'Waiting':
                                lfn = str(subRequestFile['LFN'])
                                lfns.append(lfn)
                        gMonitor.addMark('ReplicaRemovalAtt', len(lfns))

                        failed = {}
                        errMsg = {}
                        for diracSE in diracSEs:
                            res = self.replicaManager.removeReplica(
                                diracSE, lfns)
                            if res['OK']:
                                for lfn in res['Value']['Failed'].keys():
                                    errorMessage = str(
                                        res['Value']['Failed'][lfn])
                                    if errorMessage.find(
                                            'Write access not permitted for this credential.'
                                    ) != -1:
                                        if self.__getProxyAndRemoveReplica(
                                                diracSE, lfn):
                                            continue
                                    if errorMessage.find(
                                            'seconds timeout for "__gfal_wrapper" call'
                                    ) != -1:
                                        self.timeOutCounter += 1
                                    if not failed.has_key(lfn):
                                        failed[lfn] = {}
                                    failed[lfn][diracSE] = res['Value'][
                                        'Failed'][lfn]
                            else:
                                errMsg[diracSE] = res['Message']
                                for lfn in lfns:
                                    if not failed.has_key(lfn):
                                        failed[lfn] = {}
                                    failed[lfn][diracSE] = 'Completely'
                        # Now analyse the results
                        failedLFNs = failed.keys()
                        lfnsOK = [lfn for lfn in lfns if not lfn in failedLFNs]
                        gMonitor.addMark('ReplicaRemovalDone', len(lfnsOK))
                        for lfn in lfnsOK:
                            gLogger.info(
                                "RemovalAgent.execute: Successfully removed %s at %s"
                                % (lfn, str(diracSEs)))
                            res = oRequest.setSubRequestFileAttributeValue(
                                ind, 'removal', lfn, 'Status', 'Done')
                            if not res['OK']:
                                gLogger.error(
                                    "RemovalAgent.execute: Error setting status to %s for %s"
                                    % ('Done', lfn))
                            modified = True
                        if failed:
                            gMonitor.addMark('ReplicaRemovalFail',
                                             len(failedLFNs))
                            for lfn in failedLFNs:
                                for diracSE in failed[lfn].keys():
                                    if type(failed[lfn]
                                            [diracSE]) in StringTypes:
                                        if re.search(
                                                'no such file or directory',
                                                failed[lfn][diracSE].lower()):
                                            gLogger.info(
                                                "RemovalAgent.execute: File did not exist.",
                                                lfn)
                                            res = oRequest.setSubRequestFileAttributeValue(
                                                ind, 'removal', lfn, 'Status',
                                                'Done')
                                            if not res['OK']:
                                                gLogger.error(
                                                    "RemovalAgent.execute: Error setting status to %s for %s"
                                                    % ('Done', lfn))
                                            modified = True
                                        else:
                                            gLogger.info(
                                                "RemovalAgent.execute: Failed to remove file.",
                                                "%s at %s - %s" %
                                                (lfn, diracSE,
                                                 failed[lfn][diracSE]))
                        if errMsg:
                            for diracSE in errMsg.keys():
                                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE
                                gLogger.error(errStr, errMsg[diracSE])

                    ################################################
                    #  If the sub-request is a request to the online system to retransfer
                    elif operation == 'reTransfer':
                        gLogger.info(
                            "RemovalAgent.execute: Attempting to execute %s sub-request."
                            % operation)
                        diracSE = subRequestAttributes['TargetSE']
                        for subRequestFile in subRequestFiles:
                            if subRequestFile['Status'] == 'Waiting':
                                pfn = str(subRequestFile['PFN'])
                                lfn = str(subRequestFile['LFN'])
                                res = self.replicaManager.onlineRetransfer(
                                    diracSE, pfn)
                                if res['OK']:
                                    if res['Value']['Successful'].has_key(pfn):
                                        gLogger.info(
                                            "RemovalAgent.execute: Successfully requested retransfer of %s."
                                            % pfn)
                                        result = oRequest.setSubRequestFileAttributeValue(
                                            ind, 'removal', lfn, 'Status',
                                            'Done')
                                        if not result['OK']:
                                            gLogger.error(
                                                "RemovalAgent.execute: Error setting status to %s for %s"
                                                % ('Done', lfn))
                                        modified = True
                                    else:
                                        errStr = "RemovalAgent.execute: Failed to request retransfer."
                                        gLogger.error(
                                            errStr, "%s %s %s" %
                                            (pfn, diracSE,
                                             res['Value']['Failed'][pfn]))
                                else:
                                    errStr = "RemovalAgent.execute: Completely failed to request retransfer."
                                    gLogger.error(errStr, res['Message'])
                            else:
                                gLogger.info(
                                    "RemovalAgent.execute: File already completed."
                                )

                    ################################################
                    #  If the sub-request is none of the above types
                    else:
                        gLogger.error(
                            "RemovalAgent.execute: Operation not supported.",
                            operation)

                    ################################################
                    #  Determine whether there are any active files
                    if oRequest.isSubRequestEmpty(ind, 'removal')['Value']:
                        oRequest.setSubRequestStatus(ind, 'removal', 'Done')
                        gMonitor.addMark("Done", 1)

                ################################################
                #  If the sub-request is already in terminal state
                else:
                    gLogger.info(
                        "RemovalAgent.execute:",
                        "Sub-request %s is status '%s' and not to be executed."
                        % (ind, subRequestAttributes['Status']))

            ################################################
            #  Generate the new request string after operation
            newrequestString = oRequest.toXML()['Value']
        except:
            # if something fails return the original request back to the server
            res = self.requestDBClient.updateRequest(requestName,
                                                     requestString,
                                                     sourceServer)
            return S_OK()

        res = self.requestDBClient.updateRequest(requestName, newrequestString,
                                                 sourceServer)

        if modified and jobID:
            result = self.finalizeRequest(requestName, jobID, sourceServer)

        return S_OK()

    def __getProxyAndRemoveReplica(self, diracSE, lfn):
        """
    get a proxy from the owner of the file and try to remove it
    returns True if it succeeds, False otherwise
    """

        result = self.replicaManager.getCatalogDirectoryMetadata(
            lfn, singleFile=True)
        if not result['OK']:
            gLogger.error("Could not get metadata info", result['Message'])
            return False
        ownerRole = result['Value']['OwnerRole']
        ownerDN = result['Value']['OwnerDN']
        if ownerRole[0] != "/":
            ownerRole = "/%s" % ownerRole

        userProxy = ''
        for ownerGroup in Registry.getGroupsWithVOMSAttribute(ownerRole):
            result = gProxyManager.downloadVOMSProxy(
                ownerDN,
                ownerGroup,
                limited=True,
                requiredVOMSAttribute=ownerRole)
            if not result['OK']:
                gLogger.verbose(
                    'Failed to retrieve voms proxy for %s : %s:' %
                    (ownerDN, ownerRole), result['Message'])
                continue
            userProxy = result['Value']
            gLogger.verbose("Got proxy for %s@%s [%s]" %
                            (ownerDN, ownerGroup, ownerRole))
            break
        if not userProxy:
            return False

        result = userProxy.dumpAllToFile()
        if not result['OK']:
            gLogger.verbose(result['Message'])
            return False

        upFile = result['Value']
        # remember any pre-existing proxy so it can be restored afterwards
        prevProxyEnv = os.environ.get('X509_USER_PROXY')
        os.environ['X509_USER_PROXY'] = upFile

        try:
            res = self.replicaManager.removeReplica(diracSE, lfn)
            if res['OK'] and lfn in res['Value']['Successful']:
                gLogger.verbose('Removed %s from %s' % (lfn, diracSE))
                return True
        finally:
            if prevProxyEnv is None:
                del os.environ['X509_USER_PROXY']
            else:
                os.environ['X509_USER_PROXY'] = prevProxyEnv
            os.unlink(upFile)

        return False

    def finalize(self):
        """
    Called by the Agent framework to cleanly end execution.
    In this case this module will wait until all pending ThreadedJobs in the
    ThreadPool get executed
    """

        self.threadPool.processAllResults()
        return S_OK()
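__getProxyAndRemoveReplica above relies on a common idiom: dump the downloaded proxy chain to a file, point X509_USER_PROXY at it for the duration of one call, then restore the environment and delete the file. A stripped-down sketch of just that idiom, with a generic action callback standing in for the removeReplica call:

import os

def withTemporaryProxy(proxyFile, action):
    """Run action() with X509_USER_PROXY pointing at proxyFile (sketch)."""
    previous = os.environ.get('X509_USER_PROXY')
    os.environ['X509_USER_PROXY'] = proxyFile
    try:
        return action()
    finally:
        # restore the caller's proxy environment and drop the temporary file
        if previous is None:
            del os.environ['X509_USER_PROXY']
        else:
            os.environ['X509_USER_PROXY'] = previous
        os.unlink(proxyFile)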
Code example #17
0
File: GatewayService.py Project: KrzysztofCiba/DIRAC
class GatewayService( Service ):

  GATEWAY_NAME = "Framework/Gateway"

  def __init__( self ):
    Service.__init__( self, GatewayService.GATEWAY_NAME )
    self.__delegatedCredentials = DictCache()
    self.__transferBytesLimit = 1024 * 1024 * 100

  def initialize( self ):
    #Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR( "Could not build service URL for %s" % GatewayService.GATEWAY_NAME )
    gLogger.verbose( "Service URL is %s" % self._url )
    #Discover Handler
    self._initMonitoring()
    self._threadPool = ThreadPool( 1,
                                    max( 0, self._cfg.getMaxThreads() ),
                                    self._cfg.getMaxWaitingPetitions() )
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker( "%sMSB" % GatewayService.GATEWAY_NAME, threadPool = self._threadPool )
    self._msgBroker.useMessageObjects( False )
    getGlobalMessageBroker().useMessageObjects( False )
    self._msgForwarder = MessageForwarder( self._msgBroker )
    return S_OK()

  #Threaded process function
  def _processInThread( self, clientTransport ):
    #Handshake
    try:
      clientTransport.handshake()
    except:
      return
    #Add to the transport pool
    trid = self._transportPool.add( clientTransport )
    if not trid:
      return
    #Receive and check proposal
    result = self._receiveAndCheckProposal( trid )
    if not result[ 'OK' ]:
      self._transportPool.sendAndClose( trid, result )
      return
    proposalTuple = result[ 'Value' ]
    #Instantiate handler
    result = self.__getClientInitArgs( trid, proposalTuple )
    if not result[ 'OK' ]:
      self._transportPool.sendAndClose( trid, result )
      return
    clientInitArgs = result[ 'Value' ]
    #Execute the action
    result = self._processProposal( trid, proposalTuple, clientInitArgs )
    #Close the connection if required
    if result[ 'closeTransport' ]:
      self._transportPool.close( trid )
    return result

  def _receiveAndCheckProposal( self, trid ):
    clientTransport = self._transportPool.get( trid )
    #Get the peer credentials
    credDict = clientTransport.getConnectingCredentials()
    #Receive the action proposal
    retVal = clientTransport.receiveData( 1024 )
    if not retVal[ 'OK' ]:
      gLogger.error( "Invalid action proposal", "%s %s" % ( self._createIdentityString( credDict,
                                                                                        clientTransport ),
                                                            retVal[ 'Message' ] ) )
      return S_ERROR( "Invalid action proposal" )
    proposalTuple = retVal[ 'Value' ]
    gLogger.debug( "Received action from client", "/".join( list( proposalTuple[1] ) ) )
    #Check if there are extra credentials
    if proposalTuple[2]:
      clientTransport.setExtraCredentials( proposalTuple[2] )
    return S_OK( proposalTuple )

  def __getClientInitArgs( self, trid, proposalTuple ):
    clientTransport = self._transportPool.get( trid )
    #Get the peer credentials
    credDict = clientTransport.getConnectingCredentials()
    if 'x509Chain' not in credDict:
      return S_OK()
    cKey = ( credDict[ 'DN' ],
             credDict.get( 'group', False ),
             credDict.get( 'extraCredentials', False ),
             credDict[ 'isLimitedProxy' ] )
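    #Reuse cached delegated credentials while they are still valid (3600 s window)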
    dP = self.__delegatedCredentials.get( cKey, 3600 )
    idString = self._createIdentityString( credDict, clientTransport )
    if dP:
      gLogger.verbose( "Proxy for %s is cached" % idString )
      return S_OK( dP )
    result = self.__requestDelegation( clientTransport, credDict )
    if not result[ 'OK' ]:
      gLogger.warn( "Could not get proxy for %s: %s" % ( idString, result[ 'Message' ] ) )
      return result
    delChain = result[ 'Value' ]
    delegatedChain = delChain.dumpAllToString()[ 'Value' ]
    secsLeft = delChain.getRemainingSecs()[ 'Value' ] - 1
    clientInitArgs = {
                        BaseClient.KW_SETUP : proposalTuple[0][1],
                        BaseClient.KW_TIMEOUT : 600,
                        BaseClient.KW_IGNORE_GATEWAYS : True,
                        BaseClient.KW_USE_CERTIFICATES : False,
                        BaseClient.KW_PROXY_STRING : delegatedChain
                        }
    if BaseClient.KW_EXTRA_CREDENTIALS in credDict:
      clientInitArgs[ BaseClient.KW_EXTRA_CREDENTIALS ] = credDict[ BaseClient.KW_EXTRA_CREDENTIALS ]
    gLogger.warn( "Got delegated proxy for %s: %s secs left" % ( idString, secsLeft ) )
    self.__delegatedCredentials.add( cKey, secsLeft, clientInitArgs )
    return S_OK( clientInitArgs )

  def __requestDelegation( self, clientTransport, credDict ):
    peerChain = credDict[ 'x509Chain' ]
    retVal = peerChain.getCertInChain()[ 'Value' ].generateProxyRequest()
    if not retVal[ 'OK' ]:
      return retVal
    delegationRequest = retVal[ 'Value' ]
    retVal = delegationRequest.dumpRequest()
    if not retVal[ 'OK' ]:
      retVal = S_ERROR( "Server Error: Can't generate delegation request" )
      clientTransport.sendData( retVal )
      return retVal
    gLogger.info( "Sending delegation request for %s" % delegationRequest.getSubjectDN()[ 'Value' ] )
    clientTransport.sendData( S_OK( { 'delegate' : retVal[ 'Value' ] } ) )
    delegatedCertChain = clientTransport.receiveData()
    delegatedChain = X509Chain( keyObj = delegationRequest.getPKey() )
    retVal = delegatedChain.loadChainFromString( delegatedCertChain )
    if not retVal[ 'OK' ]:
      retVal = S_ERROR( "Error in receiving delegated proxy: %s" % retVal[ 'Message' ] )
      clientTransport.sendData( retVal )
      return retVal
    return S_OK( delegatedChain )

  #Msg

  def _mbConnect( self, trid, clientInitArgs ):
    return S_OK()

  def _mbReceivedMsg( self, cliTrid, msgObj ):
    return self._msgForwarder.msgFromClient( cliTrid, msgObj )

  def _mbDisconnect( self, cliTrid ):
    self._msgForwarder.cliDisconnect( cliTrid )

  #Execute action

  def _executeAction( self, trid, proposalTuple, clientInitArgs ):
    clientTransport = self._transportPool.get( trid )
    credDict = clientTransport.getConnectingCredentials()
    targetService = proposalTuple[0][0]
    actionType = proposalTuple[1][0]
    actionMethod = proposalTuple[1][1]
    idString = self._createIdentityString( credDict, clientTransport )
    #Okay! Let's do the magic!
    retVal = clientTransport.receiveData()
    if not retVal[ 'OK' ]:
      gLogger.error( "Error while receiving file description", retVal[ 'Message' ] )
      clientTransport.sendData( S_ERROR( "Error while receiving file description: %s" % retVal[ 'Message' ] ) )
      return
    if actionType == "FileTransfer":
      gLogger.warn( "Received a file transfer action from %s" % idString )
      clientTransport.sendData( S_OK( "Accepted" ) )
      retVal = self.__forwardFileTransferCall( targetService, clientInitArgs,
                                                actionMethod, retVal[ 'Value' ], clientTransport )
    elif actionType == "RPC":
      gLogger.info( "Forwarding %s/%s action to %s for %s" % ( actionType, actionMethod, targetService, idString ) )
      retVal = self.__forwardRPCCall( targetService, clientInitArgs, actionMethod, retVal[ 'Value' ] )
    elif actionType == "Connection" and actionMethod == "new":
      gLogger.info( "Initiating a messaging connection to %s for %s" % ( targetService, idString ) )
      retVal = self._msgForwarder.addClient( trid, targetService, clientInitArgs, retVal[ 'Value' ] )
    else:
      gLogger.warn( "Received an invalid %s/%s action from %s" % ( actionType, actionMethod, idString ) )
      retVal = S_ERROR( "Unknown type of action (%s)" % actionType )
    #TODO: Send back the data?
    if 'rpcStub' in retVal:
      retVal.pop( 'rpcStub' )
    clientTransport.sendData( retVal )
    return retVal

  def __forwardRPCCall( self, targetService, clientInitArgs, method, params ):
    if targetService == "Configuration/Server":
      if method == "getCompressedDataIfNewer":
        #Relay CS data directly
        serviceVersion = gConfigurationData.getVersion()
        retDict = { 'newestVersion' : serviceVersion }
        clientVersion = params[0]
        if clientVersion < serviceVersion:
          retDict[ 'data' ] = gConfigurationData.getCompressedData()
        return S_OK( retDict )
    #Default
    rpcClient = RPCClient( targetService, **clientInitArgs )
    methodObj = getattr( rpcClient, method )
    return methodObj( *params )

  def __forwardFileTransferCall( self, targetService, clientInitArgs, method,
                                 params, clientTransport ):
    transferRelay = TransferRelay( targetService, **clientInitArgs )
    transferRelay.setTransferLimit( self.__transferBytesLimit )
    cliFH = FileHelper( clientTransport )
    #Check file size
    if "ToClient" in method:
      cliFH.setDirection( "send" )
    elif "FromClient" in method:
      cliFH.setDirection( "receive" )
      if not self.__ftCheckMaxTransferSize( params[2] ):
        cliFH.markAsTransferred()
        return S_ERROR( "Transfer size is too big" )
    #Forward queries
    try:
      relayMethodObject = getattr( transferRelay, 'forward%s' % method )
    except AttributeError:
      return S_ERROR( "Cannot forward unknown method %s" % method )
    result = relayMethodObject( cliFH, params )
    return result

  def __ftCheckMaxTransferSize( self, requestedTransferSize ):
    if not self.__transferBytesLimit:
      return True
    if not requestedTransferSize:
      return True
    if requestedTransferSize <= self.__transferBytesLimit:
      return True
    return False
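
The size guard above is easy to misread: a limit of 0 disables the check entirely, and a transfer whose size was not reported is allowed through. A minimal, self-contained sketch of the same logic (the helper name and sample values are illustrative, not part of DIRAC):

def checkMaxTransferSize( limitBytes, requestedBytes ):
  #No limit configured, or size not reported by the client: allow the transfer
  if not limitBytes or not requestedBytes:
    return True
  return requestedBytes <= limitBytes

assert checkMaxTransferSize( 0, 10 ** 9 )      #limit disabled
assert checkMaxTransferSize( 1024, None )      #size unknown
assert not checkMaxTransferSize( 1024, 2048 )  #over the limit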
Code example #18
class FTSMonitorAgent( AgentModule ):
  """
  .. class:: FTSMonitorAgent

  Monitor submitted FTS jobs.
  """
  # # transfer DB handle
  transferDB = None
  # # thread pool
  threadPool = None
  # # min threads
  minThreads = 1
  # # max threads
  maxThreads = 10

  # # missing source regexp patterns
  missingSourceErrors = [
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist" ),
    re.compile( r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"\
               " Command failed. : open error: No such file or directory" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist" ) ]

  def initialize( self ):
    """ agent's initialisation """
    self.transferDB = TransferDB()
    self.am_setOption( "shifterProxy", "DataManager" )
    self.minThreads = self.am_getOption( "MinThreads", self.minThreads )
    self.maxThreads = self.am_getOption( "MaxThreads", self.maxThreads )
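    # # normalise the bounds: tolerate negative or swapped MinThreads/MaxThreads values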
    minmax = ( abs( self.minThreads ), abs( self.maxThreads ) )
    self.minThreads, self.maxThreads = min( minmax ), max( minmax )
    self.log.info( "ThreadPool min threads = %s" % self.minThreads )
    self.log.info( "ThreadPool max threads = %s" % self.maxThreads )
    self.threadPool = ThreadPool( self.minThreads, self.maxThreads )
    self.threadPool.daemonize()
    return S_OK()

  def execute( self ):
    """ push jobs to the thread pool """
    self.log.info( "Obtaining requests to monitor" )
    res = self.transferDB.getFTSReq()
    if not res["OK"]:
      self.log.error( "Failed to get FTS requests", res['Message'] )
      return res
    if not res["Value"]:
      self.log.info( "No FTS requests found to monitor." )
      return S_OK()
    ftsReqs = res["Value"]
    self.log.info( "Found %s FTS jobs" % len( ftsReqs ) )
    i = 1
    for ftsJob in ftsReqs:
      while True:
        self.log.debug( "submitting FTS Job %s FTSReqID=%s to monitor" % ( i, ftsJob["FTSReqID"] ) )
        ret = self.threadPool.generateJobAndQueueIt( self.monitorTransfer, args = ( ftsJob, ), )
        if ret["OK"]:
          i += 1
          break
        # # sleep 1 second before retrying
        time.sleep( 1 )

    self.threadPool.processAllResults()
    return S_OK()

  def ftsJobExpired( self, ftsReqID, channelID ):
    """ clean up when FTS job had expired on the server side

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    """
    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )
    fileIDs = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not fileIDs["OK"]:
      log.error( "Unable to retrieve FileIDs associated to %s request" % ftsReqID )
      return fileIDs
    fileIDs = fileIDs["Value"]

    # # update FileToFTS table, this is just a clean-up, no worry if something goes wrong
    for fileID in fileIDs:
      fileStatus = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID,
                                                              "Status", "Failed" )
      if not fileStatus["OK"]:
        log.error( "Unable to set FileToFTS status to 'Failed' for FileID %s: %s" % ( fileID,
                                                                                     fileStatus["Message"] ) )

      failReason = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID,
                                                              "Reason", "FTS job expired on server" )
      if not failReason["OK"]:
        log.error( "Unable to set FileToFTS reason for FileID %s: %s" % ( fileID,
                                                                         failReason["Message"] ) )
    # # update Channel table
    resetChannels = self.transferDB.resetFileChannelStatus( channelID, fileIDs )
    if not resetChannels["OK"]:
      log.error( "Failed to reset Channel table for files to retry" )
      return resetChannels

    # # update FTSReq table
    log.info( "Setting FTS request status to 'Finished'" )
    ftsReqStatus = self.transferDB.setFTSReqStatus( ftsReqID, "Finished" )
    if not ftsReqStatus["OK"]:
      log.error( "Failed update FTS Request status", ftsReqStatus["Message"] )
      return ftsReqStatus

    # # if we land here, everything should be OK
    return S_OK()

  def monitorTransfer( self, ftsReqDict ):
    """ monitors transfer obtained from TransferDB

    :param dict ftsReqDict: FTS job dictionary
    """
    ftsReqID = ftsReqDict.get( "FTSReqID" )
    ftsGUID = ftsReqDict.get( "FTSGuid" )
    ftsServer = ftsReqDict.get( "FTSServer" )
    channelID = ftsReqDict.get( "ChannelID" )
    sourceSE = ftsReqDict.get( "SourceSE" )
    targetSE = ftsReqDict.get( "TargetSE" )

    oFTSRequest = FTSRequest()
    oFTSRequest.setFTSServer( ftsServer )
    oFTSRequest.setFTSGUID( ftsGUID )
    oFTSRequest.setSourceSE( sourceSE )
    oFTSRequest.setTargetSE( targetSE )

    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )

    #########################################################################
    # Perform summary update of the FTS Request and update FTSReq entries.
    log.info( "Perform summary update of the FTS Request" )
    infoStr = [ "glite-transfer-status -s %s -l %s" % ( ftsServer, ftsGUID ) ]
    infoStr.append( "FTS GUID:   %s" % ftsGUID )
    infoStr.append( "FTS Server: %s" % ftsServer )
    log.info( "\n".join( infoStr ) )
    res = oFTSRequest.summary()
    self.transferDB.setFTSReqLastMonitor( ftsReqID )
    if not res["OK"]:
      log.error( "Failed to update the FTS request summary", res["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
        log.error( "FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side" )
        cleanUp = self.ftsJobExpired( ftsReqID, channelID )
        if not cleanUp["OK"]:
          log.error( cleanUp["Message"] )
        return cleanUp
      return res

    res = oFTSRequest.dumpSummary()
    if not res['OK']:
      log.error( "Failed to get FTS request summary", res["Message"] )
      return res
    log.info( res['Value'] )
    res = oFTSRequest.getPercentageComplete()
    if not res['OK']:
      log.error( "Failed to get FTS percentage complete", res["Message"] )
      return res
    log.info( 'FTS Request found to be %.1f percent complete' % res["Value"] )
    self.transferDB.setFTSReqAttribute( ftsReqID, "PercentageComplete", res["Value"] )
    self.transferDB.addLoggingEvent( ftsReqID, res["Value"] )

    #########################################################################
    # Update the information in the TransferDB if the transfer is terminal.
    res = oFTSRequest.isRequestTerminal()
    if not res["OK"]:
      log.error( "Failed to determine whether FTS request terminal", res["Message"] )
      return res
    if not res["Value"]:
      return S_OK()
    # # request is terminal
    return self.terminalRequest( oFTSRequest, ftsReqID, channelID, sourceSE )

  def terminalRequest( self, oFTSRequest, ftsReqID, channelID, sourceSE ):
    """ process terminal FTS job

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param str sourceSE: FTSReq.SourceSE
    """
    log = gLogger.getSubLogger( "@%s" % ftsReqID )

    log.info( "FTS Request found to be terminal, updating file states" )
    #########################################################################
    # Get the LFNs associated to the FTS request
    log.info( "Obtaining the LFNs associated to this request" )
    res = self.transferDB.getFTSReqLFNs( ftsReqID, channelID, sourceSE )
    if not res["OK"]:
      log.error( "Failed to obtain FTS request LFNs", res['Message'] )
      return res
    files = res["Value"]
    if not files:
      log.error( "No files present for transfer" )
      return S_ERROR( "No files were found in the DB" )

    lfns = files.keys()
    log.debug( "Obtained %s files" % len( lfns ) )
    for lfn in lfns:
      oFTSRequest.setLFN( lfn )

    res = oFTSRequest.monitor()
    if not res["OK"]:
      log.error( "Failed to perform detailed monitoring of FTS request", res["Message"] )
      return res
    res = oFTSRequest.getFailed()
    if not res["OK"]:
      log.error( "Failed to obtained failed files for FTS request", res["Message"] )
      return res
    failedFiles = res["Value"]
    res = oFTSRequest.getDone()
    if not res["OK"]:
      log.error( "Failed to obtained successful files for FTS request", res["Message"] )
      return res
    completedFiles = res["Value"]

    # An LFN can be included more than once if it was entered into more than one Request.
    # FTS will only do the transfer once. We need to identify all FileIDs
    res = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not res["OK"]:
      log.error( "Failed to get FileIDs associated to FTS Request", res["Message"] )
      return res
    fileIDs = res["Value"]
    res = self.transferDB.getAttributesForFilesList( fileIDs, ["LFN"] )
    if not res["OK"]:
      log.error( "Failed to get LFNs associated to FTS Request", res["Message"] )
      return res
    fileIDDict = res["Value"]

    fileToFTSUpdates = []
    completedFileIDs = []
    filesToRetry = []
    filesToFail = []

    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']
      if lfn in completedFiles:
        completedFileIDs.append( fileID )
        transferTime = 0
        res = oFTSRequest.getTransferTime( lfn )
        if res["OK"]:
          transferTime = res["Value"]
        fileToFTSUpdates.append( ( fileID, "Completed", "", 0, transferTime ) )

      if lfn in failedFiles:
        failReason = ""
        res = oFTSRequest.getFailReason( lfn )
        if res["OK"]:
          failReason = res["Value"]
        if "Source file/user checksum mismatch" in failReason:
          filesToFail.append( fileID )
          continue
        if self.missingSource( failReason ):
          log.error( "The source SURL does not exist.", "%s %s" % ( lfn, oFTSRequest.getSourceSURL( lfn ) ) )
          filesToFail.append( fileID )
        else:
          filesToRetry.append( fileID )
        log.error( "Failed to replicate file on channel.", "%s %s" % ( channelID, failReason ) )
        fileToFTSUpdates.append( ( fileID, "Failed", failReason, 0, 0 ) )

    # # update TransferDB.FileToFTS table
    updateFileToFTS = self.updateFileToFTS( ftsReqID, channelID,
                                            filesToRetry, filesToFail,
                                            completedFileIDs, fileToFTSUpdates )

    if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
      res = oFTSRequest.finalize()
      if not res["OK"]:
        log.error( "Failed to perform the finalization for the FTS request", res["Message"] )
        return res

      log.info( 'Adding logging event for FTS request' )
      # Now set the FTSReq status to terminal so that it is not monitored again
      res = self.transferDB.addLoggingEvent( ftsReqID, 'Finished' )
      if not res['OK']:
        log.error( 'Failed to add logging event for FTS Request', res['Message'] )

      # update TransferDB.FileToCat table
      updateFileToCat = self.updateFileToCat( oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail )
      if not updateFileToCat["OK"]:
        log.error( updateFileToCat["Message"] )

      log.debug( "Updating FTS request status" )
      res = self.transferDB.setFTSReqStatus( ftsReqID, 'Finished' )
      if not res['OK']:
        log.error( 'Failed to update FTS Request status', res['Message'] )
    return S_OK()


  def updateFileToFTS( self, ftsReqID, channelID, filesToRetry, filesToFail, completedFileIDs, fileToFTSUpdates ):
    """ update TransferDB.FileToFTS table for finished request

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param list filesToRetry: FileIDs to retry
    :param list filesToFail: FileIDs for failed files
    :param list completedFileIDs: files completed
    :param list fileToFTSUpdates: list of per-file update tuples for the FileToFTS table
    """
    log = gLogger.getSubLogger( "@%s" % ftsReqID )

    allUpdated = True

    res = self.transferDB.resetFileChannelStatus( channelID, filesToRetry ) if filesToRetry else S_OK()
    if not res["OK"]:
      log.error( "Failed to update the Channel table for file to retry.", res["Message"] )
      allUpdated = False

    for fileID in filesToFail:
      log.info( "Updating the Channel table for files to reschedule" )
      res = self.transferDB.setFileToReschedule( fileID )
      if not res["OK"]:
        log.error( "Failed to update Channel table for failed files.", res["Message"] )
        allUpdated = False
      elif res["Value"] == "max reschedule attempt reached":
        log.error( "setting Channel status to 'Failed' : " % res["Value"] )
        res = self.transferDB.setFileChannelStatus( channelID, fileID, 'Failed' )
        if not res["OK"]:
          log.error( "Failed to update Channel table for failed files.", res["Message"] )
          allUpdated = False

    if completedFileIDs:
      res = self.transferDB.updateCompletedChannelStatus( channelID, completedFileIDs )
      if not res["OK"]:
        log.error( "Failed to update the Channel table for successful files.", res["Message"] )
        allUpdated = False
      res = self.transferDB.updateAncestorChannelStatus( channelID, completedFileIDs )
      if not res["OK"]:
        log.error( 'Failed to update the Channel table for ancestors of successful files.', res['Message'] )
        allUpdated = False

    if fileToFTSUpdates:
      res = self.transferDB.setFileToFTSFileAttributes( ftsReqID, channelID, fileToFTSUpdates )
      if not res["OK"]:
        log.error( "Failed to update the FileToFTS table for files.", res["Message"] )
        allUpdated = False

    return S_OK( allUpdated )

  def updateFileToCat( self, oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail ):
    """ update TransferDB.FileToCat table for finished request

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int channelID: FTSReq.ChannelID
    :param dict fileIDDict: fileIDs dictionary
    :param list completedFiles: LFNs of successfully transferred files
    :param list filesToFail: FileIDs of files to be failed
    """
    res = oFTSRequest.getFailedRegistrations()
    failedRegistrations = res["Value"]
    regFailedFileIDs = []
    regDoneFileIDs = []
    regForgetFileIDs = []
    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']

      if lfn in failedRegistrations:
        regFailedFileIDs.append( fileID )
        # if the LFN appears more than once, FileToCat needs to be reset only once
        del failedRegistrations[lfn]
      elif lfn in completedFiles:
        regDoneFileIDs.append( fileID )
      elif fileID in filesToFail:
        regForgetFileIDs.append( fileID )

    res = self.transferDB.setRegistrationWaiting( channelID, regFailedFileIDs ) if regFailedFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to reset entries in FileToCat: %s" % res["Message"]
      return res

    res = self.transferDB.setRegistrationDone( channelID, regDoneFileIDs ) if regDoneFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
      return res

    # These entries could also be set to 'Failed', but currently there is no method to do so.
    res = self.transferDB.setRegistrationDone( channelID, regForgetFileIDs ) if regForgetFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
      return res

    return S_OK()

  @classmethod
  def missingSource( cls, failReason ):
    """ check if message sent by FTS server is concering missing source file

    :param str failReason: message sent by FTS server
    """
    for error in cls.missingSourceErrors:
      if error.search( failReason ):
        return 1
    return 0
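
To see how missingSource() classifies failure reasons, the short check below exercises one of the patterns above (the sample messages are invented for the demo):

import re

missingSourceErrors = [
  re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed" ) ]

assert any( pattern.search( "SOURCE error during TRANSFER_PREPARATION phase: [INVALID_PATH] Failed" )
            for pattern in missingSourceErrors )
assert not any( pattern.search( "TRANSFER error during TRANSFER phase: timeout" )
                for pattern in missingSourceErrors )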
Code example #19
File: Service.py Project: sparsh35/DIRAC
class Service(object):

    SVC_VALID_ACTIONS = {
        'RPC': 'export',
        'FileTransfer': 'transfer',
        'Message': 'msg',
        'Connection': 'Message'
    }
    SVC_SECLOG_CLIENT = SecurityLogClient()

    def __init__(self, serviceData):
        """
      Init the variables for the service

      :param serviceData: dict with modName, standalone, loadName, moduleObj, classObj. e.g.:
        {'modName': 'Framework/serviceName',
        'standalone': True,
        'loadName': 'Framework/serviceName',
        'moduleObj': <module 'serviceNameHandler' from '/home/DIRAC/FrameworkSystem/Service/serviceNameHandler.pyo'>,
        'classObj': <class 'serviceNameHandler.serviceHandler'>}

        Standalone is True if there is only one service started.
        If it is False, every service is linked to a different MonitoringClient.
    """
        self._svcData = serviceData
        self._name = serviceData['modName']
        self._startTime = Time.dateTime()
        self._validNames = [serviceData['modName']]
        if serviceData['loadName'] not in self._validNames:
            self._validNames.append(serviceData['loadName'])
        self._cfg = ServiceConfiguration(list(self._validNames))
        if serviceData['standalone']:
            self._monitor = gMonitor
        else:
            self._monitor = MonitoringClient()
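        # Wall-clock timestamp of the last CPU/memory report sent to the monitoring system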
        self.__monitorLastStatsUpdate = time.time()
        self._stats = {'queries': 0, 'connections': 0}
        self._authMgr = AuthManager(
            "%s/Authorization" %
            PathFinder.getServiceSection(serviceData['loadName']))
        self._transportPool = getGlobalTransportPool()
        self.__cloneId = 0
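        # Highest client socket file descriptor seen so far; reported and reset by __reportThreadPoolContents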
        self.__maxFD = 0

    def setCloneProcessId(self, cloneId):
        self.__cloneId = cloneId
        self._monitor.setComponentName("%s-Clone:%s" % (self._name, cloneId))

    def _isMetaAction(self, action):
        referedAction = Service.SVC_VALID_ACTIONS[action]
        if referedAction in Service.SVC_VALID_ACTIONS:
            return referedAction
        return False

    def initialize(self):
        # Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" % self._name)
        gLogger.verbose("Service URL is %s" % self._url)
        # Load handler
        result = self._loadHandlerInit()
        if not result['OK']:
            return result
        self._handler = result['Value']
        # Initialize lock manager
        self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
        self._initMonitoring()
        # TODO: remove ThreadPool
        if useThreadPoolExecutor:
            self._threadPool = ThreadPoolExecutor(
                max(0, self._cfg.getMaxThreads()))
        else:
            self._threadPool = ThreadPool(max(1, self._cfg.getMinThreads()),
                                          max(0, self._cfg.getMaxThreads()),
                                          self._cfg.getMaxWaitingPetitions())
            self._threadPool.daemonize()
        self._msgBroker = MessageBroker("%sMSB" % self._name,
                                        threadPool=self._threadPool)
        # Create static dict
        self._serviceInfoDict = {
            'serviceName':
            self._name,
            'serviceSectionPath':
            PathFinder.getServiceSection(self._name),
            'URL':
            self._cfg.getURL(),
            'messageSender':
            MessageSender(self._name, self._msgBroker),
            'validNames':
            self._validNames,
            'csPaths': [
                PathFinder.getServiceSection(svcName)
                for svcName in self._validNames
            ]
        }
        # Call static initialization function
        try:
            self._handler['class']._rh__initializeClass(
                dict(self._serviceInfoDict), self._lockManager,
                self._msgBroker, self._monitor)
            if self._handler['init']:
                for initFunc in self._handler['init']:
                    gLogger.verbose("Executing initialization function")
                    try:
                        result = initFunc(dict(self._serviceInfoDict))
                    except Exception as excp:
                        gLogger.exception(
                            "Exception while calling initialization function",
                            lException=excp)
                        return S_ERROR(
                            "Exception while calling initialization function: %s"
                            % str(excp))
                    if not isReturnStructure(result):
                        return S_ERROR(
                            "Service initialization function %s must return S_OK/S_ERROR"
                            % initFunc)
                    if not result['OK']:
                        return S_ERROR("Error while initializing %s: %s" %
                                       (self._name, result['Message']))
        except Exception as e:
            errMsg = "Exception while initializing %s" % self._name
            gLogger.exception(e)
            gLogger.exception(errMsg)
            return S_ERROR(errMsg)

        # Load actions after the handler has initialized itself
        result = self._loadActions()
        if not result['OK']:
            return result
        self._actions = result['Value']

        gThreadScheduler.addPeriodicTask(30, self.__reportThreadPoolContents)

        return S_OK()

    def __searchInitFunctions(self, handlerClass, currentClass=None):
        if not currentClass:
            currentClass = handlerClass
        initFuncs = []
        ancestorHasInit = False
        for ancestor in currentClass.__bases__:
            initFuncs += self.__searchInitFunctions(handlerClass, ancestor)
            if 'initializeHandler' in dir(ancestor):
                ancestorHasInit = True
        if ancestorHasInit:
            initFuncs.append(
                super(currentClass, handlerClass).initializeHandler)
        if currentClass == handlerClass and 'initializeHandler' in dir(
                handlerClass):
            initFuncs.append(handlerClass.initializeHandler)
        return initFuncs

    def _loadHandlerInit(self):
        handlerClass = self._svcData['classObj']
        handlerName = handlerClass.__name__
        handlerInitMethods = self.__searchInitFunctions(handlerClass)
        try:
            handlerInitMethods.append(
                getattr(self._svcData['moduleObj'],
                        "initialize%s" % handlerName))
        except AttributeError:
            gLogger.verbose(
                "Not found global initialization function for service")

        if handlerInitMethods:
            gLogger.info("Found %s initialization methods" %
                         len(handlerInitMethods))

        handlerInfo = {}
        handlerInfo["name"] = handlerName
        handlerInfo["module"] = self._svcData['moduleObj']
        handlerInfo["class"] = handlerClass
        handlerInfo["init"] = handlerInitMethods

        return S_OK(handlerInfo)

    def _loadActions(self):

        handlerClass = self._handler['class']

        authRules = {}
        typeCheck = {}
        methodsList = {}
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            authRules[actionType] = {}
            typeCheck[actionType] = {}
            methodsList[actionType] = []
        handlerAttributeList = dir(handlerClass)
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            methodPrefix = '%s_' % Service.SVC_VALID_ACTIONS[actionType]
            for attribute in handlerAttributeList:
                if not attribute.startswith(methodPrefix):
                    continue
                exportedName = attribute[len(methodPrefix):]
                methodsList[actionType].append(exportedName)
                gLogger.verbose("+ Found %s method %s" %
                                (actionType, exportedName))
                # Create lock for method
                self._lockManager.createLock(
                    "%s/%s" % (actionType, exportedName),
                    self._cfg.getMaxThreadsForMethod(actionType, exportedName))
                # Look for type and auth rules
                if actionType == 'RPC':
                    typeAttr = "types_%s" % exportedName
                    authAttr = "auth_%s" % exportedName
                else:
                    typeAttr = "types_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                    authAttr = "auth_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                if typeAttr in handlerAttributeList:
                    obj = getattr(handlerClass, typeAttr)
                    gLogger.verbose("|- Found type definition %s: %s" %
                                    (typeAttr, str(obj)))
                    typeCheck[actionType][exportedName] = obj
                if authAttr in handlerAttributeList:
                    obj = getattr(handlerClass, authAttr)
                    gLogger.verbose("|- Found auth rules %s: %s" %
                                    (authAttr, str(obj)))
                    authRules[actionType][exportedName] = obj

        for actionType in Service.SVC_VALID_ACTIONS:
            referedAction = self._isMetaAction(actionType)
            if not referedAction:
                continue
            gLogger.verbose("Action %s is a meta action for %s" %
                            (actionType, referedAction))
            authRules[actionType] = []
            for method in authRules[referedAction]:
                for prop in authRules[referedAction][method]:
                    if prop not in authRules[actionType]:
                        authRules[actionType].append(prop)
            gLogger.verbose("Meta action %s props are %s" %
                            (actionType, authRules[actionType]))

        return S_OK({
            'methods': methodsList,
            'auth': authRules,
            'types': typeCheck
        })

    def _initMonitoring(self):
        # Init extra bits of monitoring
        self._monitor.setComponentType(MonitoringClient.COMPONENT_SERVICE)
        self._monitor.setComponentName(self._name)
        self._monitor.setComponentLocation(self._cfg.getURL())
        self._monitor.initialize()
        self._monitor.registerActivity("Connections", "Connections received",
                                       "Framework", "connections",
                                       MonitoringClient.OP_RATE)
        self._monitor.registerActivity("Queries", "Queries served",
                                       "Framework", "queries",
                                       MonitoringClient.OP_RATE)
        self._monitor.registerActivity('CPU', "CPU Usage", 'Framework',
                                       "CPU,%", MonitoringClient.OP_MEAN, 600)
        self._monitor.registerActivity('MEM', "Memory Usage", 'Framework',
                                       'Memory,MB', MonitoringClient.OP_MEAN,
                                       600)
        self._monitor.registerActivity('PendingQueries', "Pending queries",
                                       'Framework', 'queries',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('ActiveQueries', "Active queries",
                                       'Framework', 'threads',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('RunningThreads', "Running threads",
                                       'Framework', 'threads',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('MaxFD', "Max File Descriptors",
                                       'Framework', 'fd',
                                       MonitoringClient.OP_MEAN)

        self._monitor.setComponentExtraParam('DIRACVersion', DIRAC.version)
        self._monitor.setComponentExtraParam('platform', DIRAC.getPlatform())
        self._monitor.setComponentExtraParam('startTime', Time.dateTime())
        for prop in (("__RCSID__", "version"), ("__doc__", "description")):
            try:
                value = getattr(self._handler['module'], prop[0])
            except Exception as e:
                gLogger.exception(e)
                gLogger.error("Missing property", prop[0])
                value = 'unset'
            self._monitor.setComponentExtraParam(prop[1], value)
        for secondaryName in self._cfg.registerAlsoAs():
            gLogger.info("Registering %s also as %s" %
                         (self._name, secondaryName))
            self._validNames.append(secondaryName)

        return S_OK()

    def __reportThreadPoolContents(self):
        # TODO: remove later
        if useThreadPoolExecutor:
            pendingQueries = self._threadPool._work_queue.qsize()
            activeQueries = len(self._threadPool._threads)
        else:
            pendingQueries = self._threadPool.pendingJobs()
            activeQueries = self._threadPool.numWorkingThreads()

        self._monitor.addMark('PendingQueries', pendingQueries)
        self._monitor.addMark('ActiveQueries', activeQueries)
        self._monitor.addMark('RunningThreads', threading.activeCount())
        self._monitor.addMark('MaxFD', self.__maxFD)
        self.__maxFD = 0

    def getConfig(self):
        return self._cfg

    # End of initialization functions

    def handleConnection(self, clientTransport):
        """
      This method may be called by ServiceReactor.
      The method stacks opened connections in a queue; another thread
      reads from this queue and handles each connection.

      :param clientTransport: Object which describes the opened connection (PlainTransport or SSLTransport)
    """
        self._stats['connections'] += 1
        self._monitor.setComponentExtraParam('queries',
                                             self._stats['connections'])
        # TODO: remove later
        if useThreadPoolExecutor:
            self._threadPool.submit(self._processInThread, clientTransport)
        else:
            self._threadPool.generateJobAndQueueIt(self._processInThread,
                                                   args=(clientTransport, ))

    # Threaded process function
    def _processInThread(self, clientTransport):
        """
    This method handles an RPC, FileTransfer or Connection action.
    The connection may be opened via ServiceReactor.__acceptIncomingConnection


    - Do the SSL/TLS handshake (if dips is used) and extract the credentials
    - Get the action called by the client
    - Check if the client is authorized to perform the action
      - If not, the connection is closed
    - Instantiate the RequestHandler (the RequestHandler contains all callable methods)

    (The following is not directly in this method, but it describes what happens at
    #Execute the action)
    - Notify the client that we are ready to execute the action (via _processProposal)
      and call RequestHandler._rh_executeAction()
    - Receive arguments/file/something else (depending on the action) in the RequestHandler
    - Execute the action asked by the client

    :param clientTransport: Object which describes the opened connection (SSLTransport or PlainTransport)

    :return: S_OK with "closeTransport", a boolean indicating whether the connection has to be closed,
            e.g. after an RPC, closeTransport=True

    """
        self.__maxFD = max(self.__maxFD, clientTransport.oSocket.fileno())
        self._lockManager.lockGlobal()
        try:
            monReport = self.__startReportToMonitoring()
        except Exception:
            monReport = False
        try:
            # Handshake
            try:
                result = clientTransport.handshake()
                if not result['OK']:
                    clientTransport.close()
                    return
            except Exception:
                return
            # Add to the transport pool
            trid = self._transportPool.add(clientTransport)
            if not trid:
                return
            # Receive and check proposal
            result = self._receiveAndCheckProposal(trid)
            if not result['OK']:
                self._transportPool.sendAndClose(trid, result)
                return
            proposalTuple = result['Value']
            # Instantiate handler
            result = self._instantiateHandler(trid, proposalTuple)
            if not result['OK']:
                self._transportPool.sendAndClose(trid, result)
                return
            handlerObj = result['Value']
            # Execute the action
            result = self._processProposal(trid, proposalTuple, handlerObj)
            # Close the connection if required
            if result['closeTransport'] or not result['OK']:
                if not result['OK']:
                    gLogger.error("Error processing proposal",
                                  result['Message'])
                self._transportPool.close(trid)
            return result
        finally:
            self._lockManager.unlockGlobal()
            if monReport:
                self.__endReportToMonitoring(*monReport)

    def _createIdentityString(self, credDict, clientTransport=None):
        if 'username' in credDict:
            if 'group' in credDict:
                identity = "[%s:%s]" % (credDict['username'],
                                        credDict['group'])
            else:
                identity = "[%s:unknown]" % credDict['username']
        else:
            identity = 'unknown'
        if clientTransport:
            addr = clientTransport.getRemoteAddress()
            if addr:
                addr = "{%s:%s}" % (addr[0], addr[1])
        if 'DN' in credDict:
            identity += "(%s)" % credDict['DN']
        return identity

    @staticmethod
    def _deserializeProposalTuple(serializedProposal):
        """ We receive the proposalTuple as a list.
        Turn it into a tuple again
    """
        proposalTuple = tuple(
            tuple(x) if isinstance(x, list) else x for x in serializedProposal)
        return proposalTuple

    def _receiveAndCheckProposal(self, trid):
        clientTransport = self._transportPool.get(trid)
        # Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        # Receive the action proposal
        retVal = clientTransport.receiveData(1024)
        if not retVal['OK']:
            gLogger.error(
                "Invalid action proposal",
                "%s %s" % (self._createIdentityString(
                    credDict, clientTransport), retVal['Message']))
            return S_ERROR("Invalid action proposal")
        proposalTuple = Service._deserializeProposalTuple(retVal['Value'])
        gLogger.debug("Received action from client",
                      "/".join(list(proposalTuple[1])))
        # Check if there are extra credentials
        if proposalTuple[2]:
            clientTransport.setExtraCredentials(proposalTuple[2])
        # Check if this is the requested service
        requestedService = proposalTuple[0][0]
        if requestedService not in self._validNames:
            return S_ERROR("%s is not up in this server" % requestedService)
        # Check if the action is valid
        requestedActionType = proposalTuple[1][0]
        if requestedActionType not in Service.SVC_VALID_ACTIONS:
            return S_ERROR("%s is not a known action type" %
                           requestedActionType)
        # Check if it's authorized
        result = self._authorizeProposal(proposalTuple[1], trid, credDict)
        if not result['OK']:
            return result
        # Proposal is OK
        return S_OK(proposalTuple)

    def _authorizeProposal(self, actionTuple, trid, credDict):
        # Find CS path for the Auth rules
        referedAction = self._isMetaAction(actionTuple[0])
        if referedAction:
            csAuthPath = "%s/Default" % actionTuple[0]
            hardcodedMethodAuth = self._actions['auth'][actionTuple[0]]
        else:
            if actionTuple[0] == 'RPC':
                csAuthPath = actionTuple[1]
            else:
                csAuthPath = "/".join(actionTuple)
            # Find if there are hardcoded auth rules in the code
            hardcodedMethodAuth = False
            if actionTuple[0] in self._actions['auth']:
                hardcodedRulesByType = self._actions['auth'][actionTuple[0]]
                if actionTuple[0] == "FileTransfer":
                    methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
                else:
                    methodName = actionTuple[1]

                if methodName in hardcodedRulesByType:
                    hardcodedMethodAuth = hardcodedRulesByType[methodName]
        # Auth time!
        if not self._authMgr.authQuery(csAuthPath, credDict,
                                       hardcodedMethodAuth):
            # Get the identity string
            identity = self._createIdentityString(credDict)
            fromHost = "unknown host"
            tr = self._transportPool.get(trid)
            if tr:
                fromHost = '/'.join(
                    [str(item) for item in tr.getRemoteAddress()])
            gLogger.warn(
                "Unauthorized query", "to %s:%s by %s from %s" %
                (self._name, "/".join(actionTuple), identity, fromHost))
            result = S_ERROR(ENOAUTH, "Unauthorized query")
        else:
            result = S_OK()

        # Security log
        tr = self._transportPool.get(trid)
        if not tr:
            return S_ERROR("Client disconnected")
        sourceAddress = tr.getRemoteAddress()
        identity = self._createIdentityString(credDict)
        Service.SVC_SECLOG_CLIENT.addMessage(result['OK'], sourceAddress[0],
                                             sourceAddress[1], identity,
                                             self._cfg.getHostname(),
                                             self._cfg.getPort(), self._name,
                                             "/".join(actionTuple))
        return result

    def _instantiateHandler(self, trid, proposalTuple=None):
        """
    Generate an instance of the handler for a given service

    :param int trid: transport ID
    :param tuple proposalTuple: tuple describing the proposed action

    :return: S_OK/S_ERROR, Value is the handler object
    """
        # Generate the client params
        clientParams = {'serviceStartTime': self._startTime}
        if proposalTuple:
            # The 4th element is the client version
            clientParams['clientVersion'] = proposalTuple[3] if len(
                proposalTuple) > 3 else None
            clientParams['clientSetup'] = proposalTuple[0][1]
            if len(proposalTuple[0]) < 3:
                clientParams['clientVO'] = gConfig.getValue(
                    "/DIRAC/VirtualOrganization", "unknown")
            else:
                clientParams['clientVO'] = proposalTuple[0][2]
        clientTransport = self._transportPool.get(trid)
        if clientTransport:
            clientParams['clientAddress'] = clientTransport.getRemoteAddress()
        # Generate handler dict with per client info
        handlerInitDict = dict(self._serviceInfoDict)
        for key in clientParams:
            handlerInitDict[key] = clientParams[key]
        # Instantiate and initialize
        try:
            handlerInstance = self._handler['class'](handlerInitDict, trid)
            handlerInstance.initialize()
        except Exception as e:
            gLogger.exception("Server error while loading handler: %s" %
                              str(e))
            return S_ERROR("Server error while loading handler")
        return S_OK(handlerInstance)

    def _processProposal(self, trid, proposalTuple, handlerObj):
        # Notify the client we're ready to execute the action
        retVal = self._transportPool.send(trid, S_OK())
        if not retVal['OK']:
            return retVal

        messageConnection = False
        if proposalTuple[1] == ('Connection', 'new'):
            messageConnection = True

        if messageConnection:

            if self._msgBroker.getNumConnections(
            ) > self._cfg.getMaxMessagingConnections():
                result = S_ERROR(
                    "Maximum number of connections reached. Try later")
                result['closeTransport'] = True
                return result

            # This is a stable connection
            self._msgBroker.addTransportId(
                trid,
                self._name,
                receiveMessageCallback=self._mbReceivedMsg,
                disconnectCallback=self._mbDisconnect,
                listenToConnection=False)

        result = self._executeAction(trid, proposalTuple, handlerObj)
        if result['OK'] and messageConnection:
            self._msgBroker.listenToTransport(trid)
            result = self._mbConnect(trid, handlerObj)
            if not result['OK']:
                self._msgBroker.removeTransport(trid)

        result['closeTransport'] = not messageConnection or not result['OK']
        return result

    def _mbConnect(self, trid, handlerObj=None):
        if not handlerObj:
            result = self._instantiateHandler(trid)
            if not result['OK']:
                return result
            handlerObj = result['Value']
        return handlerObj._rh_executeConnectionCallback('connected')

    def _executeAction(self, trid, proposalTuple, handlerObj):
        try:
            return handlerObj._rh_executeAction(proposalTuple)
        except Exception as e:
            gLogger.exception("Exception while executing handler action")
            return S_ERROR("Server error while executing action: %s" % str(e))

    def _mbReceivedMsg(self, trid, msgObj):
        result = self._authorizeProposal(
            ('Message', msgObj.getName()), trid,
            self._transportPool.get(trid).getConnectingCredentials())
        if not result['OK']:
            return result
        result = self._instantiateHandler(trid)
        if not result['OK']:
            return result
        handlerObj = result['Value']
        return handlerObj._rh_executeMessageCallback(msgObj)

    def _mbDisconnect(self, trid):
        result = self._instantiateHandler(trid)
        if not result['OK']:
            return result
        handlerObj = result['Value']
        return handlerObj._rh_executeConnectionCallback('drop')

    def __startReportToMonitoring(self):
        self._monitor.addMark("Queries")
        now = time.time()
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        if now - self.__monitorLastStatsUpdate < 0:
            return (now, cpuTime)
        # Send CPU consumption mark
        wallClock = now - self.__monitorLastStatsUpdate
        self.__monitorLastStatsUpdate = now
        # Send Memory consumption mark
        membytes = MemStat.VmB('VmRSS:')
        if membytes:
            mem = membytes / (1024. * 1024.)
            self._monitor.addMark('MEM', mem)
        return (now, cpuTime)

    def __endReportToMonitoring(self, initialWallTime, initialCPUTime):
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        # Guard against a zero wall-clock interval from coarse timers
        if wallTime > 0:
            percentage = cpuTime / wallTime * 100.
            if percentage > 0:
                self._monitor.addMark('CPU', percentage)
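
Two small pieces of the Service machinery above are worth isolating: rebuilding the proposal tuple received from the wire, and resolving meta actions. A standalone sketch under those assumptions (the sample proposal contents are invented):

SVC_VALID_ACTIONS = {'RPC': 'export', 'FileTransfer': 'transfer',
                     'Message': 'msg', 'Connection': 'Message'}


def isMetaAction(action):
    # An action type is "meta" when it maps onto another valid action type
    referedAction = SVC_VALID_ACTIONS[action]
    return referedAction if referedAction in SVC_VALID_ACTIONS else False


serialized = [['Framework/SomeService', 'someSetup'], ['RPC', 'ping'], '']
proposalTuple = tuple(tuple(x) if isinstance(x, list) else x
                      for x in serialized)
assert proposalTuple == (('Framework/SomeService', 'someSetup'), ('RPC', 'ping'), '')
assert isMetaAction('Connection') == 'Message'  # Connection auth reuses the Message rules
assert isMetaAction('RPC') is False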
Code example #20
File: FTSMonitorAgent.py Project: ptakha/DIRAC-1
class FTSMonitorAgent(AgentModule):
    """
  .. class:: FTSMonitorAgent

  Monitor submitted FTS jobs.
  """
    # # transfer DB handle
    transferDB = None
    # # thread pool
    threadPool = None
    # # min threads
    minThreads = 1
    # # max threads
    maxThreads = 10

    # # missing source regexp patterns
    missingSourceErrors = [
      re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed" ),
      re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory" ),
      re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed" ),
      re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist" ),
      re.compile( r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"\
                 " Command failed. : open error: No such file or directory" ),
      re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist" ) ]

    def initialize(self):
        """ agent's initialisation """
        self.transferDB = TransferDB()
        self.am_setOption("shifterProxy", "DataManager")
        self.minThreads = self.am_getOption("MinThreads", self.minThreads)
        self.maxThreads = self.am_getOption("MaxThreads", self.maxThreads)
        minmax = (abs(self.minThreads), abs(self.maxThreads))
        self.minThreads, self.maxThreads = min(minmax), max(minmax)
        self.log.info("ThreadPool min threads = %s" % self.minThreads)
        self.log.info("ThreadPool max threads = %s" % self.maxThreads)
        self.threadPool = ThreadPool(self.minThreads, self.maxThreads)
        self.threadPool.daemonize()
        return S_OK()

    def execute(self):
        """ push jobs to the thread pool """
        self.log.info("Obtaining requests to monitor")
        res = self.transferDB.getFTSReq()
        if not res["OK"]:
            self.log.error("Failed to get FTS requests", res['Message'])
            return res
        if not res["Value"]:
            self.log.info("No FTS requests found to monitor.")
            return S_OK()
        ftsReqs = res["Value"]
        self.log.info("Found %s FTS jobs" % len(ftsReqs))
        i = 1
        for ftsJob in ftsReqs:
            while True:
                self.log.debug("submitting FTS Job %s FTSReqID=%s to monitor" %
                               (i, ftsJob["FTSReqID"]))
                ret = self.threadPool.generateJobAndQueueIt(
                    self.monitorTransfer,
                    args=(ftsJob, ),
                )
                if ret["OK"]:
                    i += 1
                    break
                # # sleep 1 second before retrying
                time.sleep(1)

        self.threadPool.processAllResults()
        return S_OK()

    def ftsJobExpired(self, ftsReqID, channelID):
        """ clean up when FTS job had expired on the server side

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    """
        log = gLogger.getSubLogger("@%s" % str(ftsReqID))
        fileIDs = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not fileIDs["OK"]:
            log.error("Unable to retrieve FileIDs associated to %s request" %
                      ftsReqID)
            return fileIDs
        fileIDs = fileIDs["Value"]

        # # update FileToFTS table, this is just a clean-up, no worry if something goes wrong
        for fileID in fileIDs:
            fileStatus = self.transferDB.setFileToFTSFileAttribute(
                ftsReqID, fileID, "Status", "Failed")
            if not fileStatus["OK"]:
                log.error(
                    "Unable to set FileToFTS status to 'Failed' for FileID %s: %s"
                    % (fileID, fileStatus["Message"]))

            failReason = self.transferDB.setFileToFTSFileAttribute(
                ftsReqID, fileID, "Reason", "FTS job expired on server")
            if not failReason["OK"]:
                log.error("Unable to set FileToFTS reason for FileID %s: %s" %
                          (fileID, failReason["Message"]))
        # # update Channel table
        resetChannels = self.transferDB.resetFileChannelStatus(
            channelID, fileIDs)
        if not resetChannels["OK"]:
            log.error("Failed to reset Channel table for files to retry")
            return resetChannels

        # # update FTSReq table
        log.info("Setting FTS request status to 'Finished'")
        ftsReqStatus = self.transferDB.setFTSReqStatus(ftsReqID, "Finished")
        if not ftsReqStatus["OK"]:
            log.error("Failed update FTS Request status",
                      ftsReqStatus["Message"])
            return ftsReqStatus

        # # if we land here, everything should be OK
        return S_OK()

    def monitorTransfer(self, ftsReqDict):
        """ monitors transfer obtained from TransferDB

    :param dict ftsReqDict: FTS job dictionary
    """
        ftsReqID = ftsReqDict.get("FTSReqID")
        ftsGUID = ftsReqDict.get("FTSGuid")
        ftsServer = ftsReqDict.get("FTSServer")
        channelID = ftsReqDict.get("ChannelID")
        sourceSE = ftsReqDict.get("SourceSE")
        targetSE = ftsReqDict.get("TargetSE")

        oFTSRequest = FTSRequest()
        oFTSRequest.setFTSServer(ftsServer)
        oFTSRequest.setFTSGUID(ftsGUID)
        oFTSRequest.setSourceSE(sourceSE)
        oFTSRequest.setTargetSE(targetSE)

        log = gLogger.getSubLogger("@%s" % str(ftsReqID))

        #########################################################################
        # Perform summary update of the FTS Request and update FTSReq entries.
        log.info("Perform summary update of the FTS Request")
        infoStr = ["glite-transfer-status -s %s -l %s" % (ftsServer, ftsGUID)]
        infoStr.append("FTS GUID:   %s" % ftsGUID)
        infoStr.append("FTS Server: %s" % ftsServer)
        log.info("\n".join(infoStr))
        res = oFTSRequest.summary()
        self.transferDB.setFTSReqLastMonitor(ftsReqID)
        if not res["OK"]:
            log.error("Failed to update the FTS request summary",
                      res["Message"])
            if "getTransferJobSummary2: Not authorised to query request" in res[
                    "Message"]:
                log.error(
                    "FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side"
                )
                cleanUp = self.ftsJobExpired(ftsReqID, channelID)
                if not cleanUp["OK"]:
                    log.error(cleanUp["Message"])
                return cleanUp
            return res

        res = oFTSRequest.dumpSummary()
        if not res['OK']:
            log.error("Failed to get FTS request summary", res["Message"])
            return res
        log.info(res['Value'])
        res = oFTSRequest.getPercentageComplete()
        if not res['OK']:
            log.error("Failed to get FTS percentage complete", res["Message"])
            return res
        log.info('FTS Request found to be %.1f percent complete' %
                 res["Value"])
        self.transferDB.setFTSReqAttribute(ftsReqID, "PercentageComplete",
                                           res["Value"])
        self.transferDB.addLoggingEvent(ftsReqID, res["Value"])

        #########################################################################
        # Update the information in the TransferDB if the transfer is terminal.
        res = oFTSRequest.isRequestTerminal()
        if not res["OK"]:
            log.error("Failed to determine whether FTS request terminal",
                      res["Message"])
            return res
        if not res["Value"]:
            return S_OK()
        # # request is terminal
        return self.terminalRequest(oFTSRequest, ftsReqID, channelID, sourceSE)

    def terminalRequest(self, oFTSRequest, ftsReqID, channelID, sourceSE):
        """ process terminal FTS job

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param str sourceSE: FTSReq.SourceSE
    """
        log = gLogger.getSubLogger("@%s" % ftsReqID)

        log.info("FTS Request found to be terminal, updating file states")
        #########################################################################
        # Get the LFNS associated to the FTS request
        log.info("Obtaining the LFNs associated to this request")
        res = self.transferDB.getFTSReqLFNs(ftsReqID, channelID, sourceSE)
        if not res["OK"]:
            log.error("Failed to obtain FTS request LFNs", res['Message'])
            return res
        files = res["Value"]
        if not files:
            log.error("No files present for transfer")
            return S_ERROR("No files were found in the DB")

        lfns = files.keys()
        log.debug("Obtained %s files" % len(lfns))
        for lfn in lfns:
            oFTSRequest.setLFN(lfn)

        res = oFTSRequest.monitor()
        if not res["OK"]:
            log.error("Failed to perform detailed monitoring of FTS request",
                      res["Message"])
            return res
        res = oFTSRequest.getFailed()
        if not res["OK"]:
            log.error("Failed to obtained failed files for FTS request",
                      res["Message"])
            return res
        failedFiles = res["Value"]
        res = oFTSRequest.getDone()
        if not res["OK"]:
            log.error("Failed to obtained successful files for FTS request",
                      res["Message"])
            return res
        completedFiles = res["Value"]

        # An LFN can be included more than once if it was entered into more than one Request.
        # FTS will only do the transfer once. We need to identify all FileIDs
        res = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not res["OK"]:
            log.error("Failed to get FileIDs associated to FTS Request",
                      res["Message"])
            return res
        fileIDs = res["Value"]
        res = self.transferDB.getAttributesForFilesList(fileIDs, ["LFN"])
        if not res["OK"]:
            log.error("Failed to get LFNs associated to FTS Request",
                      res["Message"])
            return res
        fileIDDict = res["Value"]

        fileToFTSUpdates = []
        completedFileIDs = []
        filesToRetry = []
        filesToFail = []

        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in completedFiles:
                completedFileIDs.append(fileID)
                transferTime = 0
                res = oFTSRequest.getTransferTime(lfn)
                if res["OK"]:
                    transferTime = res["Value"]
                fileToFTSUpdates.append(
                    (fileID, "Completed", "", 0, transferTime))

            if lfn in failedFiles:
                failReason = ""
                res = oFTSRequest.getFailReason(lfn)
                if res["OK"]:
                    failReason = res["Value"]
                if "Source file/user checksum mismatch" in failReason:
                    filesToFail.append(fileID)
                    continue
                if self.missingSource(failReason):
                    log.error("The source SURL does not exist.",
                              "%s %s" % (lfn, oFTSRequest.getSourceSURL(lfn)))
                    filesToFail.append(fileID)
                else:
                    filesToRetry.append(fileID)
                log.error("Failed to replicate file on channel.",
                          "%s %s" % (channelID, failReason))
                fileToFTSUpdates.append((fileID, "Failed", failReason, 0, 0))

        # # update TransferDB.FileToFTS table
        updateFileToFTS = self.updateFileToFTS(ftsReqID, channelID,
                                               filesToRetry, filesToFail,
                                               completedFileIDs,
                                               fileToFTSUpdates)

        if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
            res = oFTSRequest.finalize()
            if not res["OK"]:
                log.error(
                    "Failed to perform the finalization for the FTS request",
                    res["Message"])
                return res

            log.info('Adding logging event for FTS request')
            # Now set the FTSReq status to terminal so that it is not monitored again
            res = self.transferDB.addLoggingEvent(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed to add logging event for FTS Request',
                          res['Message'])

            # update TransferDB.FileToCat table
            updateFileToCat = self.updateFileToCat(oFTSRequest, channelID,
                                                   fileIDDict, completedFiles,
                                                   filesToFail)
            if not updateFileToCat["OK"]:
                log.error(updateFileToCat["Message"])

            log.debug("Updating FTS request status")
            res = self.transferDB.setFTSReqStatus(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed to update FTS Request status', res['Message'])
        return S_OK()

    def updateFileToFTS(self, ftsReqID, channelID, filesToRetry, filesToFail,
                        completedFileIDs, fileToFTSUpdates):
        """ update TransferDB.FileToFTS table for finished request

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param list filesToRetry: FileIDs to retry
    :param list filesToFail: FileIDs for failed files
    :param list completedFileIDs: files completed
    :param list fileToFTSUpdates: list of (fileID, status, reason, retries, transferTime) tuples
    """
        log = gLogger.getSubLogger("@%s" % ftsReqID)

        allUpdated = True

        res = self.transferDB.resetFileChannelStatus(
            channelID, filesToRetry) if filesToRetry else S_OK()
        if not res["OK"]:
            log.error("Failed to update the Channel table for file to retry.",
                      res["Message"])
            allUpdated = False

        for fileID in filesToFail:
            log.info("Updating the Channel table for files to reschedule")
            res = self.transferDB.setFileToReschedule(fileID)
            if not res["OK"]:
                log.error("Failed to update Channel table for failed files.",
                          res["Message"])
                allUpdated = False
            elif res["Value"] == "max reschedule attempt reached":
                log.error("setting Channel status to 'Failed' : " %
                          res["Value"])
                res = self.transferDB.setFileChannelStatus(
                    channelID, fileID, 'Failed')
                if not res["OK"]:
                    log.error(
                        "Failed to update Channel table for failed files.",
                        res["Message"])
                    allUpdated = False

        if completedFileIDs:
            res = self.transferDB.updateCompletedChannelStatus(
                channelID, completedFileIDs)
            if not res["OK"]:
                log.error(
                    "Failed to update the Channel table for successful files.",
                    res["Message"])
                allUpdated = False
            res = self.transferDB.updateAncestorChannelStatus(
                channelID, completedFileIDs)
            if not res["OK"]:
                log.error(
                    'Failed to update the Channel table for ancestors of successful files.',
                    res['Message'])
                allUpdated = False

        if fileToFTSUpdates:
            res = self.transferDB.setFileToFTSFileAttributes(
                ftsReqID, channelID, fileToFTSUpdates)
            if not res["OK"]:
                log.error("Failed to update the FileToFTS table for files.",
                          res["Message"])
                allUpdated = False

        return S_OK(allUpdated)

    def updateFileToCat(self, oFTSRequest, channelID, fileIDDict,
                        completedFiles, filesToFail):
        """ update TransferDB.FileToCat table for finished request

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int channelID: FTSReq.ChannelID
    :param dict fileIDDict: fileIDs dictionary
    :param list completedFiles: LFNs of successfully transferred files
    :param list filesToFail: FileIDs of files to be failed
    """
        res = oFTSRequest.getFailedRegistrations()
        if not res["OK"]:
            return res
        failedRegistrations = res["Value"]
        regFailedFileIDs = []
        regDoneFileIDs = []
        regForgetFileIDs = []
        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']

            if lfn in failedRegistrations:
                regFailedFileIDs.append(fileID)
                # if the LFN appears more than once, FileToCat needs to be reset only once
                del failedRegistrations[lfn]
            elif lfn in completedFiles:
                regDoneFileIDs.append(fileID)
            elif fileID in filesToFail:
                regForgetFileIDs.append(fileID)

        res = self.transferDB.setRegistrationWaiting(
            channelID, regFailedFileIDs) if regFailedFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to reset entries in FileToCat: %s" % res[
                "Message"]
            return res

        res = self.transferDB.setRegistrationDone(
            channelID, regDoneFileIDs) if regDoneFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res[
                "Message"]
            return res

        # These entries could also be set to Failed, but currently there is no method to do so.
        res = self.transferDB.setRegistrationDone(
            channelID, regForgetFileIDs) if regForgetFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res[
                "Message"]
            return res

        return S_OK()

    @classmethod
    def missingSource(cls, failReason):
        """ check if message sent by FTS server is concering missing source file

    :param str failReason: message sent by FTS server
    """
        for error in cls.missingSourceErrors:
            if error.search(failReason):
                return 1
        return 0
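
The missingSourceErrors class attribute iterated above is not part of this snippet; since it is scanned with .search(), it must hold compiled regular expressions. A minimal sketch of how such a list could be built, with purely illustrative error strings (the patterns below are placeholders, not the agent's actual configuration):

import re

# Hypothetical placeholder patterns for FTS "missing source" errors;
# the real agent defines its own list of compiled expressions.
missingSourceErrors = [
    re.compile(pattern) for pattern in (
        r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed",
        r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory",
    )
]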
Code example #21
0
File: FTSAgent.py Project: JanEbbing/DIRAC
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # # fts placement refresh in seconds
  FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # Max number of requests fetched from the RMS
  MAX_REQUESTS = 100
  # Minimum interval (seconds) between 2 job monitoring
  MONITORING_INTERVAL = 600

  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for the FTS version
  __ftsVersion = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSPlacement
  __ftsPlacement = None

  # # placement regeneration time delta
  __ftsPlacementValidStamp = None

  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient



  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getRequest( cls, reqID ):
    """ get Requests systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqID )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqID, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "request of id '%s' not found in ReqDB" % reqID )
    cls.__reqCache[reqID] = getRequest

    return S_OK( cls.__reqCache[reqID] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestID not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestID, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.iteritems():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool


  def resetFTSPlacement( self ):
    """ create fts Placement """

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()

    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """


    # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH )
    log.info( "FTSPlacement validity period       = %s s" % self.FTSPLACEMENT_REFRESH )


    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission  = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route       = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob            = ", str( self.MAX_FILES_PER_JOB ) )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts          = ", str( self.MAX_ATTEMPT ) )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads         = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads         = ", str( self.MAX_THREADS ) )

    self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS )
    log.info( "Max Requests fetched           = ", str( self.MAX_REQUESTS ) )

    self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL )
    log.info( "Minimum monitoring interval    = ", str( self.MONITORING_INTERVAL ) )

    self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' )
    log.info( "FTSVersion : %s" % self.__ftsVersion )
    log.info( "initialize: creation of FTSPlacement..." )
    createPlacement = self.resetFTSPlacement()
    if not createPlacement["OK"]:
      log.error( "initialize:", createPlacement["Message"] )
      return createPlacement

    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    self.registrationProtocols = getRegistrationProtocols()


    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )


    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #  log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #  log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #  put = self.requestClient().putRequest( request )
    #  if not put["OK"]:
    #    log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """

    # Don't use the server certificate otherwise the DFC won't let us write
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )


    log = gLogger.getSubLogger( "execute" )
    # # reset FTSPlacement if expired
    now = datetime.datetime.now()
    if now > self.__ftsPlacementValidStamp:
      log.info( "resetting expired FTS placement..." )
      resetFTSPlacement = self.resetFTSPlacement()
      if not resetFTSPlacement["OK"]:
        log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] )
        return resetFTSPlacement
      self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS )
    if not requestIDs["OK"]:
      log.error( "unable to read scheduled request ids" , requestIDs["Message"] )
      return requestIDs
    if not requestIDs["Value"]:
      requestIDs = []
    else:
      requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ]
    requestIDs += self.__reqCache.keys()

    if not requestIDs:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestIDs ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) )

    for requestID in requestIDs:
      request = self.getRequest( requestID )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestID
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "Waiting returned operation is not an operation:", type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select  FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

      now = datetime.datetime.utcnow()
      jobsToMonitor = [job for job in ftsJobs if ( now - job.LastUpdate ).seconds > ( self.MONITORING_INTERVAL * ( 3. if job.Status == 'Staging' else 1. ) )]
      if jobsToMonitor:
        log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in jobsToMonitor:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.iteritems():
          if ftsFiles:
            log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )
      if len( ftsJobs ) != len( jobsToMonitor ):
        log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) )

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # Actually the condition below should never happen... Change printout for checking
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems()
                              if not [ftsFile for ftsFile in ftsFiles
                                      if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Active transfers for which there is no FTS job any longer and reschedule them
            for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]:
                ftsFilesDict['toReschedule'].append( ftsFile )
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

            # Recover files that are Failed but were not spotted
            for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              reschedule, submit, fail = self.__checkFailed( ftsFile )
              if fail and ftsFile not in ftsFilesDict['toFail']:
                ftsFilesDict['toFail'].append( ftsFile )
              elif reschedule and ftsFile not in ftsFilesDict['toReschedule']:
                ftsFilesDict['toReschedule'].append( ftsFile )
              elif submit and ftsFile not in ftsFilesDict['toSubmit']:
                ftsFilesDict['toSubmit'].append( ftsFile )

            # If all transfers are finished for unregistered files and there is already a registration operation, set it Done
            for lfn in missingReplicas:
              if not [f for f in ftsFiles if f.LFN == lfn and ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]:
                for opFile in operation:
                  if opFile.LFN == lfn:
                    opFile.Status = 'Done'
                    break
          for key, ftsFiles in ftsFilesDict.iteritems():
            if ftsFiles:
              log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # request.Status should be 'Failed' if all files in the operation are 'Failed'
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise escapeTry

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.iteritems():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( 'Failed to reschedule files', rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # If the file did not fail unrecoverably and is not yet in toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # should not put back jobs that have not been monitored this time
      ftsJobs = jobsToMonitor
      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestID, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except escapeTry:
      # This clause is raised when one wants to return from within the try: clause
      pass
    except Exception, exceptMessage:
      log.exception( "Exception in processRequest", lException = exceptMessage )
    finally:
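      # The original snippet is truncated here. A hedged sketch of the elided
      # clean-up, inferred only from the "try: finally:" comment above (put the
      # FTS jobs and the request back before returning); the real body may differ:
      putJobs = self.putFTSJobs( ftsJobs )
      if not putJobs["OK"]:
        log.error( "unable to put back FTSJobs:", putJobs["Message"] )
      putRequest = self.putRequest( request )
      if not putRequest["OK"]:
        log.error( "unable to put back request:", putRequest["Message"] )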
Code example #22
0
File: GatewayService.py Project: vingar/DIRAC
class GatewayService(Service):
    """ Inherits from Service so it can (and should) be run as a DIRAC service,
      but replaces several of the internal methods
  """

    GATEWAY_NAME = "Framework/Gateway"

    def __init__(self):
        """ Initialize like a real service
    """
        super(GatewayService, self).__init__({
            'modName':
            GatewayService.GATEWAY_NAME,
            'loadName':
            GatewayService.GATEWAY_NAME,
            'standalone':
            True,
            'moduleObj':
            sys.modules[DIRAC.Core.DISET.private.GatewayService.GatewayService.
                        __module__],
            'classObj':
            self.__class__
        })
        self.__delegatedCredentials = DictCache()
        self.__transferBytesLimit = 1024 * 1024 * 100
        # to be resolved
        self._url = None
        self._handler = None
        self._threadPool = None
        self._msgBroker = None
        self._msgForwarder = None

    def initialize(self):
        """ This replaces the standard initialize from Service
    """
        #Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" %
                           GatewayService.GATEWAY_NAME)
        gLogger.verbose("Service URL is %s" % self._url)
        #Load handler
        result = self._loadHandlerInit()
        if not result['OK']:
            return result
        self._handler = result['Value']
        #Discover Handler
        self._threadPool = ThreadPool(1, max(0, self._cfg.getMaxThreads()),
                                      self._cfg.getMaxWaitingPetitions())
        self._threadPool.daemonize()
        self._msgBroker = MessageBroker("%sMSB" % GatewayService.GATEWAY_NAME,
                                        threadPool=self._threadPool)
        self._msgBroker.useMessageObjects(False)
        getGlobalMessageBroker().useMessageObjects(False)
        self._msgForwarder = MessageForwarder(self._msgBroker)
        return S_OK()

    def _processInThread(self, clientTransport):
        """ Threaded process function
    """
        #Handshake
        try:
            clientTransport.handshake()
        except Exception:
            return
        #Add to the transport pool
        trid = self._transportPool.add(clientTransport)
        if not trid:
            return
        #Receive and check proposal
        result = self._receiveAndCheckProposal(trid)
        if not result['OK']:
            self._transportPool.sendAndClose(trid, result)
            return
        proposalTuple = result['Value']
        #Instantiate handler
        result = self.__getClientInitArgs(trid, proposalTuple)
        if not result['OK']:
            self._transportPool.sendAndClose(trid, result)
            return
        clientInitArgs = result['Value']
        #Execute the action
        result = self._processProposal(trid, proposalTuple, clientInitArgs)
        #Close the connection if required
        if result['closeTransport']:
            self._transportPool.close(trid)
        return result

    def _receiveAndCheckProposal(self, trid):
        clientTransport = self._transportPool.get(trid)
        #Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        #Receive the action proposal
        retVal = clientTransport.receiveData(1024)
        if not retVal['OK']:
            gLogger.error(
                "Invalid action proposal",
                "%s %s" % (self._createIdentityString(
                    credDict, clientTransport), retVal['Message']))
            return S_ERROR("Invalid action proposal")
        proposalTuple = retVal['Value']
        gLogger.debug("Received action from client",
                      "/".join(list(proposalTuple[1])))
        #Check if there are extra credentials
        if proposalTuple[2]:
            clientTransport.setExtraCredentials(proposalTuple[2])
        return S_OK(proposalTuple)

    def __getClientInitArgs(self, trid, proposalTuple):
        clientTransport = self._transportPool.get(trid)
        #Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        if 'x509Chain' not in credDict:
            return S_OK()
        cKey = (credDict['DN'], credDict.get('group', False),
                credDict.get('extraCredentials',
                             False), credDict['isLimitedProxy'])
        dP = self.__delegatedCredentials.get(cKey, 3600)
        idString = self._createIdentityString(credDict, clientTransport)
        if dP:
            gLogger.verbose("Proxy for %s is cached" % idString)
            return S_OK(dP)
        result = self.__requestDelegation(clientTransport, credDict)
        if not result['OK']:
            gLogger.warn("Could not get proxy for %s: %s" %
                         (idString, result['Message']))
            return result
        delChain = result['Value']
        delegatedChain = delChain.dumpAllToString()['Value']
        secsLeft = delChain.getRemainingSecs()['Value'] - 1
        clientInitArgs = {
            BaseClient.KW_SETUP: proposalTuple[0][1],
            BaseClient.KW_TIMEOUT: 600,
            BaseClient.KW_IGNORE_GATEWAYS: True,
            BaseClient.KW_USE_CERTIFICATES: False,
            BaseClient.KW_PROXY_STRING: delegatedChain
        }
        if BaseClient.KW_EXTRA_CREDENTIALS in credDict:
            clientInitArgs[BaseClient.KW_EXTRA_CREDENTIALS] = credDict[
                BaseClient.KW_EXTRA_CREDENTIALS]
        gLogger.warn("Got delegated proxy for %s: %s secs left" %
                     (idString, secsLeft))
        self.__delegatedCredentials.add(cKey, secsLeft, clientInitArgs)
        return S_OK(clientInitArgs)

    def __requestDelegation(self, clientTransport, credDict):
        peerChain = credDict['x509Chain']
        retVal = peerChain.getCertInChain()['Value'].generateProxyRequest()
        if not retVal['OK']:
            return retVal
        delegationRequest = retVal['Value']
        retVal = delegationRequest.dumpRequest()
        if not retVal['OK']:
            retVal = S_ERROR("Server Error: Can't generate delegation request")
            clientTransport.sendData(retVal)
            return retVal
        gLogger.info("Sending delegation request for %s" %
                     delegationRequest.getSubjectDN()['Value'])
        clientTransport.sendData(S_OK({'delegate': retVal['Value']}))
        delegatedCertChain = clientTransport.receiveData()
        delegatedChain = X509Chain(keyObj=delegationRequest.getPKey())
        retVal = delegatedChain.loadChainFromString(delegatedCertChain)
        if not retVal['OK']:
            retVal = S_ERROR("Error in receiving delegated proxy: %s" %
                             retVal['Message'])
            clientTransport.sendData(retVal)
            return retVal
        return S_OK(delegatedChain)

    #Msg

    def _mbConnect(self, trid, handlerObj=None):
        return S_OK()

    def _mbReceivedMsg(self, cliTrid, msgObj):
        return self._msgForwarder.msgFromClient(cliTrid, msgObj)

    def _mbDisconnect(self, cliTrid):
        self._msgForwarder.cliDisconnect(cliTrid)

    #Execute action

    def _executeAction(self, trid, proposalTuple, clientInitArgs):
        clientTransport = self._transportPool.get(trid)
        credDict = clientTransport.getConnectingCredentials()
        targetService = proposalTuple[0][0]
        actionType = proposalTuple[1][0]
        actionMethod = proposalTuple[1][1]
        idString = self._createIdentityString(credDict, clientTransport)
        #Okay! Let's do the magic!
        retVal = clientTransport.receiveData()
        if not retVal['OK']:
            gLogger.error("Error while receiving file description",
                          retVal['Message'])
            clientTransport.sendData(
                S_ERROR("Error while receiving file description: %s" %
                        retVal['Message']))
            return
        if actionType == "FileTransfer":
            gLogger.warn("Received a file transfer action from %s" % idString)
            clientTransport.sendData(S_OK("Accepted"))
            retVal = self.__forwardFileTransferCall(targetService,
                                                    clientInitArgs,
                                                    actionMethod,
                                                    retVal['Value'],
                                                    clientTransport)
        elif actionType == "RPC":
            gLogger.info("Forwarding %s/%s action to %s for %s" %
                         (actionType, actionMethod, targetService, idString))
            retVal = self.__forwardRPCCall(targetService, clientInitArgs,
                                           actionMethod, retVal['Value'])
        elif actionType == "Connection" and actionMethod == "new":
            gLogger.info("Initiating a messaging connection to %s for %s" %
                         (targetService, idString))
            retVal = self._msgForwarder.addClient(trid, targetService,
                                                  clientInitArgs,
                                                  retVal['Value'])
        else:
            gLogger.warn("Received an invalid %s/%s action from %s" %
                         (actionType, actionMethod, idString))
            retVal = S_ERROR("Unknown type of action (%s)" % actionType)
        #TODO: Send back the data?
        if 'rpcStub' in retVal:
            retVal.pop('rpcStub')
        clientTransport.sendData(retVal)
        return retVal

    def __forwardRPCCall(self, targetService, clientInitArgs, method, params):
        if targetService == "Configuration/Server":
            if method == "getCompressedDataIfNewer":
                #Relay CS data directly
                serviceVersion = gConfigurationData.getVersion()
                retDict = {'newestVersion': serviceVersion}
                clientVersion = params[0]
                if clientVersion < serviceVersion:
                    retDict['data'] = gConfigurationData.getCompressedData()
                return S_OK(retDict)
        #Default
        rpcClient = RPCClient(targetService, **clientInitArgs)
        methodObj = getattr(rpcClient, method)
        return methodObj(*params)

    def __forwardFileTransferCall(self, targetService, clientInitArgs, method,
                                  params, clientTransport):
        transferRelay = TransferRelay(targetService, **clientInitArgs)
        transferRelay.setTransferLimit(self.__transferBytesLimit)
        cliFH = FileHelper(clientTransport)
        #Check file size
        if method.find("ToClient") > -1:
            cliFH.setDirection("send")
        elif method.find("FromClient") > -1:
            cliFH.setDirection("receive")
            if not self.__ftCheckMaxTransferSize(params[2]):
                cliFH.markAsTransferred()
                return S_ERROR("Transfer size is too big")
        #Forward queries
        try:
            relayMethodObject = getattr(transferRelay, 'forward%s' % method)
        except AttributeError:
            return S_ERROR("Cannot forward unknown method %s" % method)
        result = relayMethodObject(cliFH, params)
        return result

    def __ftCheckMaxTransferSize(self, requestedTransferSize):
        if not self.__transferBytesLimit:
            return True
        if not requestedTransferSize:
            return True
        if requestedTransferSize <= self.__transferBytesLimit:
            return True
        return False
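
The delegated-credential handling above leans on DictCache with time-based validity. A minimal sketch of that idiom, assuming the usual DIRAC import path and the semantics visible here, namely that get(key, validSeconds) returns the stored value only while it remains valid for at least that long, while add(key, secondsToLive, value) stores it; the key and payload below are made up:

from DIRAC.Core.Utilities.DictCache import DictCache

cache = DictCache()
cKey = ("someDN", "someGroup")      # made-up cache key
clientArgs = cache.get(cKey, 3600)  # still valid for at least an hour?
if not clientArgs:
    # cache a made-up payload for two hours
    cache.add(cKey, 7200, {"proxy": "PEM data"})
    clientArgs = cache.get(cKey, 3600)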
Code example #23
0
File: Service.py Project: sposs/DIRAC
class Service:

    SVC_VALID_ACTIONS = {
        'RPC': 'export',
        'FileTransfer': 'transfer',
        'Message': 'msg',
        'Connection': 'Message'
    }
    SVC_SECLOG_CLIENT = SecurityLogClient()

    def __init__(self, serviceData):
        self._svcData = serviceData
        self._name = serviceData['loadName']
        self._startTime = Time.dateTime()
        self._validNames = [serviceData['modName']]
        if serviceData['loadName'] not in self._validNames:
            self._validNames.append(serviceData['loadName'])
        self._cfg = ServiceConfiguration(list(self._validNames))
        if serviceData['standalone']:
            self._monitor = gMonitor
        else:
            self._monitor = MonitoringClient()
        self.__monitorLastStatsUpdate = time.time()
        self._stats = {'queries': 0, 'connections': 0}
        self._authMgr = AuthManager(
            "%s/Authorization" %
            PathFinder.getServiceSection(serviceData['loadName']))
        self._transportPool = getGlobalTransportPool()
        self.__cloneId = 0
        self.__maxFD = 0

    def setCloneProcessId(self, cloneId):
        self.__cloneId = cloneId
        self._monitor.setComponentName("%s-Clone:%s" % (self._name, cloneId))

    def _isMetaAction(self, action):
        referedAction = Service.SVC_VALID_ACTIONS[action]
        if referedAction in Service.SVC_VALID_ACTIONS:
            return referedAction
        return False

    def initialize(self):
        #Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" % self._name)
        gLogger.verbose("Service URL is %s" % self._url)
        #Load handler
        result = self._loadHandlerInit()
        if not result['OK']:
            return result
        self._handler = result['Value']
        #Initialize lock manager
        self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
        self._initMonitoring()
        self._threadPool = ThreadPool(1, max(0, self._cfg.getMaxThreads()),
                                      self._cfg.getMaxWaitingPetitions())
        self._threadPool.daemonize()
        self._msgBroker = MessageBroker("%sMSB" % self._name,
                                        threadPool=self._threadPool)
        #Create static dict
        self._serviceInfoDict = {
            'serviceName':
            self._name,
            'serviceSectionPath':
            PathFinder.getServiceSection(self._name),
            'URL':
            self._cfg.getURL(),
            'messageSender':
            MessageSender(self._name, self._msgBroker),
            'validNames':
            self._validNames,
            'csPaths': [
                PathFinder.getServiceSection(svcName)
                for svcName in self._validNames
            ]
        }
        #Call static initialization function
        try:
            self._handler['class']._rh__initializeClass(
                dict(self._serviceInfoDict), self._lockManager,
                self._msgBroker, self._monitor)
            if self._handler['init']:
                for initFunc in self._handler['init']:
                    gLogger.verbose("Executing initialization function")
                    try:
                        result = initFunc(dict(self._serviceInfoDict))
                    except Exception, excp:
                        gLogger.exception(
                            "Exception while calling initialization function")
                        return S_ERROR(
                            "Exception while calling initialization function: %s"
                            % str(excp))
                    if not isReturnStructure(result):
                        return S_ERROR(
                            "Service initialization function %s must return S_OK/S_ERROR"
                            % initFunc)
                    if not result['OK']:
                        return S_ERROR("Error while initializing %s: %s" %
                                       (self._name, result['Message']))
        except Exception, e:
            errMsg = "Exception while initializing %s" % self._name
            gLogger.exception(errMsg)
            return S_ERROR(errMsg)

        #Load actions after the handler has initialized itself
        result = self._loadActions()
        if not result['OK']:
            return result
        self._actions = result['Value']

        gThreadScheduler.addPeriodicTask(30, self.__reportThreadPoolContents)

        return S_OK()
Code example #24
0
File: Service.py Project: zhangxiaomei/DIRAC
class Service:

  SVC_VALID_ACTIONS = { 'RPC' : 'export',
                        'FileTransfer': 'transfer',
                        'Message' : 'msg',
                        'Connection' : 'Message' }
  SVC_SECLOG_CLIENT = SecurityLogClient()

  def __init__( self, serviceName ):
    self._name = serviceName
    self._startTime = Time.dateTime()
    self._cfg = ServiceConfiguration( serviceName )
    self._validNames = [ self._name ]
    self._monitor = MonitoringClient()
    self.__monitorLastStatsUpdate = time.time()
    self._stats = { 'queries' : 0, 'connections' : 0 }
    self._authMgr = AuthManager( "%s/Authorization" % self._cfg.getServicePath() )
    self._transportPool = getGlobalTransportPool()
    self.__cloneId = 0

  def setCloneProcessId( self, cloneId ):
    self.__cloneId = cloneId
    self._monitor.setComponentName( "%s-Clone:%s" % ( self._name, cloneId ) )

  def _isMetaAction( self, action ):
    referedAction = Service.SVC_VALID_ACTIONS[ action ]
    if referedAction in Service.SVC_VALID_ACTIONS:
      return referedAction
    return False

  def initialize( self ):
    #Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR( "Could not build service URL for %s" % self._name )
    gLogger.verbose( "Service URL is %s" % self._url )
    #Discover Handler
    self._handlerLocation = self._discoverHandlerLocation()
    if not self._handlerLocation:
      return S_ERROR( "Could not find handler location for %s" % self._name )
    gLogger.verbose( "Handler found at %s" % self._handlerLocation )
    #Load handler
    result = self._loadHandler()
    if not result[ 'OK' ]:
      return result
    self._handler = result[ 'Value' ]
    #Initialize lock manager
    self._lockManager = LockManager( self._cfg.getMaxWaitingPetitions() )
    #Load actions
    result = self._loadActions()
    if not result[ 'OK' ]:
      return result
    self._actions = result[ 'Value' ]
    self._initMonitoring()
    self._threadPool = ThreadPool( 1,
                                    max( 0, self._cfg.getMaxThreads() ),
                                    self._cfg.getMaxWaitingPetitions() )
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker( "%sMSB" % self._name, threadPool = self._threadPool )
    #Create static dict
    self._serviceInfoDict = { 'serviceName' : self._name,
                               'URL' : self._cfg.getURL(),
                               'systemSectionPath' : self._cfg.getSystemPath(),
                               'serviceSectionPath' : self._cfg.getServicePath(),
                               'messageSender' : MessageSender( self._msgBroker )
                             }
    #Call static initialization function
    try:
      if self._handler[ 'init' ]:
        result = self._handler[ 'init' ]( dict( self._serviceInfoDict ) )
        if not isReturnStructure( result ):
          return S_ERROR( "Service initialization function must return S_OK/S_ERROR" )
        if not result[ 'OK' ]:
          return S_ERROR( "Error while initializing %s: %s" % ( self._name, result[ 'Message' ] ) )
    except Exception, e:
      errMsg = "Exception while intializing %s" % self._name
      gLogger.exception( errMsg )
      return S_ERROR( errMsg )

    gThreadScheduler.addPeriodicTask( 30, self.__reportThreadPoolContents )

    return S_OK()
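
Across these examples the ThreadPool lifecycle is always the same: construct it with (minThreads, maxThreads[, maxQueuedRequests]), daemonize() it, enqueue work, and optionally collect results. A condensed sketch of that idiom with a stand-in task function; the import path is assumed to be the usual DIRAC one, and everything else mirrors calls shown in the examples above:

import time
from DIRAC.Core.Utilities.ThreadPool import ThreadPool

def task( requestID ):
  # stand-in for a real worker such as FTSAgent.processRequest
  return requestID

pool = ThreadPool( 1, 10 )
pool.daemonize()
for reqID in ( 1, 2, 3 ):
  # retry until a pool slot frees up, as FTSAgent.execute does
  while True:
    queued = pool.generateJobAndQueueIt( task, args = ( reqID, ), sTJId = reqID )
    if queued["OK"]:
      break
    time.sleep( 1 )
pool.processAllResults()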