Example #1
def sendNumTaskQueues():
    result = gTaskQueueDB.getNumTaskQueues()
    if result['OK']:
        gMonitor.addMark('numTQs', result['Value'])
    else:
        gLogger.error("Cannot get the number of task queues",
                      result['Message'])
Example #2
  def __checkReplicas( self ):
    """ check done replicas and update file states  """
    waitingFiles = dict( [ ( opFile.LFN, opFile ) for opFile in self.operation
                          if opFile.Status in ( "Waiting", "Scheduled" ) ] )
    targetSESet = set( self.operation.targetSEList )

    replicas = self.fc.getReplicas( waitingFiles.keys() )
    if not replicas["OK"]:
      self.log.error( 'Failed to get replicas', replicas["Message"] )
      return replicas

    reMissing = re.compile( r".*such file.*" )
    for failedLFN, errStr in replicas["Value"]["Failed"].items():
      waitingFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        self.log.error( "File does not exists", failedLFN )
        gMonitor.addMark( "ReplicateFail", len( targetSESet ) )
        waitingFiles[failedLFN].Status = "Failed"

    for successfulLFN, reps in replicas["Value"]["Successful"].items():
      if targetSESet.issubset( set( reps ) ):
        self.log.info( "file %s has been replicated to all targets" % successfulLFN )
        waitingFiles[successfulLFN].Status = "Done"

    return S_OK()
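Every snippet in this listing leans on DIRAC's return-value convention: S_OK wraps a payload under 'Value', S_ERROR carries a 'Message', and callers branch on the 'OK' key. A self-contained sketch of the pattern (the divide function is a toy, not DIRAC code):

from DIRAC import S_OK, S_ERROR

def safeDivide(a, b):
    if b == 0:
        return S_ERROR("division by zero")
    return S_OK(a / b)

result = safeDivide(4, 2)
if result['OK']:
    print(result['Value'])    # 2.0
else:
    print(result['Message'])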
Example #3
class MatcherHandler(RequestHandler):
    def initialize(self):
        self.limiter = Limiter(jobDB=gJobDB)

##############################################################################

    types_requestJob = [[StringType, DictType]]

    def export_requestJob(self, resourceDescription):
        """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """

        resourceDescription['Setup'] = self.serviceInfoDict['clientSetup']
        credDict = self.getRemoteCredentials()

        try:
            opsHelper = Operations(group=credDict['group'])
            matcher = Matcher(pilotAgentsDB=pilotAgentsDB,
                              jobDB=gJobDB,
                              tqDB=gTaskQueueDB,
                              jlDB=jlDB,
                              opsHelper=opsHelper)
            result = matcher.selectJob(resourceDescription, credDict)
        except RuntimeError as rte:
            self.log.error("Error requesting job: ", rte)
            return S_ERROR("Error requesting job")
        gMonitor.addMark("matchesDone")
        gMonitor.addMark("matchesOK")
        return S_OK(result)
Example #4
    def export_requestJob(self, resourceDescription):
        """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """

        resourceDescription['Setup'] = self.serviceInfoDict['clientSetup']
        credDict = self.getRemoteCredentials()

        try:
            opsHelper = Operations(group=credDict['group'])
            matcher = Matcher(pilotAgentsDB=pilotAgentsDB,
                              jobDB=gJobDB,
                              tqDB=gTaskQueueDB,
                              jlDB=jlDB,
                              opsHelper=opsHelper)
            result = matcher.selectJob(resourceDescription, credDict)
        except RuntimeError as rte:
            self.log.error("Error requesting job: ", rte)
            return S_ERROR("Error requesting job")

        # result can be empty, meaning that no job matched
        if result:
            gMonitor.addMark("matchesDone")
            gMonitor.addMark("matchesOK")
            return S_OK(result)
        # FIXME: This is correctly interpreted by the JobAgent, but DErrno should be used instead
        return S_ERROR("No match found")
Example #5
  def export_putRequest( self, requestJSON ):
    """ forward request from local RequestDB to central RequestManager

    :param self: self reference
    :param str requestJSON: request serialized to JSON
    """

    gMonitor.addMark( 'reqReceived', 1 )

    requestDict = json.loads( requestJSON )
    requestName = requestDict.get( "RequestID", requestDict.get( 'RequestName', "***UNKNOWN***" ) )
    gLogger.info( "putRequest: got request '%s'" % requestName )

    forwardable = self.__forwardable( requestDict )
    if not forwardable["OK"]:
      gLogger.warn( "putRequest: %s" % forwardable["Message"] )


    setRequest = self.requestManager().putRequest( requestJSON )
    if not setRequest["OK"]:
      gLogger.error( "setReqeuest: unable to set request '%s' @ RequestManager: %s" % ( requestName,
                                                                                        setRequest["Message"] ) )
      # # put request to the request file cache
      save = self.__saveRequest( requestName, requestJSON )
      if not save["OK"]:
        gLogger.error( "setRequest: unable to save request to the cache: %s" % save["Message"] )
        return save
      gLogger.info( "setRequest: %s is saved to %s file" % ( requestName, save["Value"] ) )
      return S_OK( { "set" : False, "saved" : True } )

    gLogger.info( "setRequest: request '%s' has been set to the ReqManager" % ( requestName ) )
    return S_OK( { "set" : True, "saved" : False } )
Example #6
  def export_requestJob(self, resourceDescription):
    """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """

    resourceDescription['Setup'] = self.serviceInfoDict['clientSetup']
    credDict = self.getRemoteCredentials()

    try:
      opsHelper = Operations(group=credDict['group'])
      matcher = Matcher(pilotAgentsDB=pilotAgentsDB,
                        jobDB=gJobDB,
                        tqDB=gTaskQueueDB,
                        jlDB=jlDB,
                        opsHelper=opsHelper)
      result = matcher.selectJob(resourceDescription, credDict)
    except RuntimeError as rte:
      self.log.error("Error requesting job: ", rte)
      return S_ERROR("Error requesting job")

    # result can be empty, meaning that no job matched
    if result:
      gMonitor.addMark("matchesDone")
      gMonitor.addMark("matchesOK")
      return S_OK(result)
    # FIXME: This is correctly interpreted by the JobAgent, but DErrno should be used instead
    return S_ERROR("No match found")
Example #7
    def export_requestJob(self, resourceDescription):
        """Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
        """

        resourceDescription["Setup"] = self.serviceInfoDict["clientSetup"]
        credDict = self.getRemoteCredentials()
        pilotRef = resourceDescription.get("PilotReference", "Unknown")

        try:
            opsHelper = Operations(group=credDict["group"])
            matcher = Matcher(
                pilotAgentsDB=self.pilotAgentsDB,
                jobDB=self.jobDB,
                tqDB=self.taskQueueDB,
                jlDB=self.jobLoggingDB,
                opsHelper=opsHelper,
                pilotRef=pilotRef,
            )
            result = matcher.selectJob(resourceDescription, credDict)
        except RuntimeError as rte:
            self.log.error("Error requesting job for pilot", "[%s] %s" % (pilotRef, rte))
            return S_ERROR("Error requesting job")
        except PilotVersionError as pve:
            self.log.warn("Pilot version error for pilot", "[%s] %s" % (pilotRef, pve))
            return S_ERROR(DErrno.EWMSPLTVER, callStack=[])

        # result can be empty, meaning that no job matched
        if result:
            gMonitor.addMark("matchesDone")
            gMonitor.addMark("matchesOK")
            return S_OK(result)
        return S_ERROR(DErrno.EWMSNOMATCH, callStack=[])
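Unlike the string-based S_ERROR of the earlier variants, this version returns structured DErrno codes, which callers can compare without parsing messages. A hedged sketch, assuming DIRAC's DErrno utilities:

from DIRAC import S_ERROR
from DIRAC.Core.Utilities import DErrno

result = S_ERROR(DErrno.EWMSNOMATCH, callStack=[])
if not result['OK'] and DErrno.cmpError(result, DErrno.EWMSNOMATCH):
    print("no job matched this pilot")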
Example #8
    def __printSummary(self):
        ''' pretty print summary '''
        res = self.storageUsage.getStorageSummary()
        if res['OK']:
            self.log.notice("Storage Usage Summary")
            self.log.notice(
                "============================================================")
            self.log.notice(
                "%-40s %20s %20s" %
                ('Storage Element', 'Number of files', 'Total size'))

            for se in sorted(res['Value']):
                site = se.split('_')[0].split('-')[0]
                gMonitor.registerActivity("%s-used" % se,
                                          "%s usage" % se,
                                          "StorageUsage/%s usage" % site,
                                          "",
                                          gMonitor.OP_MEAN,
                                          bucketLength=600)
                gMonitor.registerActivity("%s-files" % se,
                                          "%s files" % se,
                                          "StorageUsage/%s files" % site,
                                          "Files",
                                          gMonitor.OP_MEAN,
                                          bucketLength=600)

            time.sleep(2)

            for se in sorted(res['Value']):
                usage = res['Value'][se]['Size']
                files = res['Value'][se]['Files']
                self.log.notice("%-40s %20s %20s" %
                                (se, str(files), str(usage)))
                gMonitor.addMark("%s-used" % se, usage)
                gMonitor.addMark("%s-files" % se, files)
Example #9
  def __checkReplicas(self):
    """ check done replicas and update file states  """
    waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation
                         if opFile.Status in ("Waiting", "Scheduled")])
    targetSESet = set(self.operation.targetSEList)

    replicas = self.fc.getReplicas(waitingFiles.keys())
    if not replicas["OK"]:
      self.log.error('Failed to get replicas', replicas["Message"])
      return replicas

    reMissing = re.compile(r".*such file.*")
    for failedLFN, errStr in replicas["Value"]["Failed"].iteritems():
      waitingFiles[failedLFN].Error = errStr
      if reMissing.search(errStr.lower()):
        self.log.error("File does not exists", failedLFN)
        gMonitor.addMark("ReplicateFail", len(targetSESet))
        waitingFiles[failedLFN].Status = "Failed"

    for successfulLFN, reps in replicas["Value"]["Successful"].iteritems():
      if targetSESet.issubset(set(reps)):
        self.log.info("file %s has been replicated to all targets" % successfulLFN)
        waitingFiles[successfulLFN].Status = "Done"

    return S_OK()
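Note that this variant iterates with dict.iteritems(), which exists only on Python 2; the otherwise equivalent Example #2 above uses items() and runs on both major versions.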
Example #10
  def execute(self):
    """ Main execution method. Just fills a list, and a queue, with BKKQueries ID.
    """

    gMonitor.addMark('Iteration', 1)
    # Get all the transformations
    result = self.transClient.getTransformations(condDict={'Status': ['Active', 'Idle']})
    if not result['OK']:
      self._logError("Failed to get transformations.", result['Message'])
      return S_OK()
    transIDsList = [long(transDict['TransformationID']) for transDict in result['Value']]
    res = self.transClient.getTransformationsWithBkQueries(transIDsList)
    if not res['OK']:
      self._logError("Failed to get transformations with Bk Queries.", res['Message'])
      return S_OK()
    transIDsWithBkQueriesList = res['Value']

    _count = 0
    # Process each transformation
    for transID in transIDsWithBkQueriesList:
      if transID in self.bkQueriesInCheck:
        continue
      self.bkQueriesInCheck.append(transID)
      self.bkQueriesToBeChecked.put(transID)
      _count += 1

    self._logInfo("Out of %d transformations, %d put in thread queue" % (len(result['Value']), _count))

    self.__dumpLog()
    return S_OK()
Example #11
    def execute(self):
        """ one cycle execution """

        now = datetime.datetime.now()
        kickTime = now - datetime.timedelta(hours=self.KICK_ASSIGNED_HOURS)
        rmTime = now - datetime.timedelta(days=self.DEL_GRACE_DAYS)

        kicked = 0
        deleted = 0

        # # select Assigned FTSJobs
        assignedFTSJobList = self.ftsClient().getFTSJobList(["Assigned"],
                                                            self.KICK_LIMIT)
        if not assignedFTSJobList["OK"]:
            self.log.error("execute: %s" % assignedFTSJobList["Message"])
            return assignedFTSJobList
        assignedFTSJobList = assignedFTSJobList["Value"]

        for ftsJob in assignedFTSJobList:
            if ftsJob.LastUpdate < kickTime:
                self.log.debug(
                    "FTSJob %s is Assigned for too long and has to be kicked" %
                    ftsJob.FTSGUID)
                kicked += 1
                ftsJob.Status = "Submitted"
            put = self.ftsClient().putFTSJob(ftsJob)
            if not put["OK"]:
                self.log.error("execute: unable to put back FTSJob %s: %s" %
                               (ftsJob.FTSGUID, put["Message"]))
                return put

        finishedFTSJobList = self.ftsClient().getFTSJobList(
            list(FTSJob.FINALSTATES), self.DEL_LIMIT)
        if not finishedFTSJobList["OK"]:
            self.log.error("execute: %s" % finishedFTSJobList["Message"])
            return finishedFTSJobList
        finishedFTSJobList = finishedFTSJobList["Value"]

        for ftsJob in finishedFTSJobList:
            if ftsJob.LastUpdate < rmTime:
                self.log.debug("FTSJob %s is too old and has to be deleted" %
                               ftsJob.FTSGUID)
                delJob = self.ftsClient().deleteFTSJob(ftsJob.FTSJobID)
                if not delJob["OK"]:
                    self.log.error("execute: %s" % delJob["Message"])
                    return delJob
                deleted += 1
            else:
                putJob = self.ftsClient().putFTSJob(ftsJob)
                if not putJob["OK"]:
                    self.log.error("execute: %s" % putJob["Message"])
                    return putJob

        self.log.info(
            "Assigned FTSJobs kicked %s Finished FTSJobs deleted %s" %
            (kicked, deleted))
        gMonitor.addMark("KickedFTSJobs", kicked)
        gMonitor.addMark("DeletedFTSJobs", deleted)
        return S_OK()
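The cut-off arithmetic is easy to get backwards: a stale job is one whose LastUpdate lies before the threshold, which is why the comparisons above use <. A standalone sketch (threshold values are illustrative):

import datetime

KICK_ASSIGNED_HOURS = 1
DEL_GRACE_DAYS = 14

now = datetime.datetime.now()
kickTime = now - datetime.timedelta(hours=KICK_ASSIGNED_HOURS)
rmTime = now - datetime.timedelta(days=DEL_GRACE_DAYS)

lastUpdate = now - datetime.timedelta(hours=3)
print(lastUpdate < kickTime)  # True: assigned for too long, kick it
print(lastUpdate < rmTime)    # False: still within the deletion grace period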
Example #12
 def _endReportToMonitoring(self, initialWallTime, initialCPUTime):
     wallTime = time.time() - initialWallTime
     stats = os.times()
     cpuTime = stats[0] + stats[2] - initialCPUTime
     percentage = 0
     if wallTime:
         percentage = cpuTime / wallTime * 100.
     if percentage > 0:
         gMonitor.addMark('CPU', percentage)
Example #13
 def _endReportToMonitoring( self, initialWallTime, initialCPUTime ):
   wallTime = time.time() - initialWallTime
   stats = os.times()
   cpuTime = stats[0] + stats[2] - initialCPUTime
   percentage = 0
   if wallTime:
     percentage = cpuTime / wallTime * 100.
   if percentage > 0:
     gMonitor.addMark( 'CPU', percentage )
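Both variants compute the same quantity: CPU consumed since a snapshot, as a percentage of elapsed wall-clock time. A self-contained version of the calculation; os.times() returns (user, system, children-user, children-system, elapsed):

import os
import time

def cpuPercentage(initialWallTime, initialCPUTime):
    """Percent of one core used by this process and its children since the snapshot."""
    wallTime = time.time() - initialWallTime
    stats = os.times()
    cpuTime = stats[0] + stats[2] - initialCPUTime  # user + children-user CPU seconds
    return cpuTime / wallTime * 100.0 if wallTime else 0.0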
Example #14
  def execute( self ):
    """ one cycle execution """

    # Don't use the server certificate otherwise the DFC won't let us write
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )


    log = gLogger.getSubLogger( "execute" )
    # # reset FTSPlacement if expired
    now = datetime.datetime.now()
    if now > self.__ftsPlacementValidStamp:
      log.info( "resetting expired FTS placement..." )
      resetFTSPlacement = self.resetFTSPlacement()
      if not resetFTSPlacement["OK"]:
        log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] )
        return resetFTSPlacement
      self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )

    requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS )
    if not requestIDs["OK"]:
      log.error( "unable to read scheduled request ids" , requestIDs["Message"] )
      return requestIDs
    if not requestIDs["Value"]:
      requestIDs = []
    else:
      requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ]
    requestIDs += self.__reqCache.keys()

    if not requestIDs:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()

    log.info( "found %s requests to process:" % len( requestIDs ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " =>   new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) )

    for requestID in requestIDs:
      request = self.getRequest( requestID )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestID
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()
Example #15
  def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID, request.RequestName, ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )

    monitor = ftsJob.monitorFTS( self.__ftsVersion, command = self.MONITOR_COMMAND )
    if not monitor["OK"]:
      gMonitor.addMark( "FTSMonitorFail", 1 )
      log.error( monitor["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \
         'was not found' in monitor['Message'] or\
         "Not found" in monitor['Message'] or\
         'Unknown transfer state' in monitor['Message']:
        log.error( "FTSJob not known (expired on server?): delete it" )
        for ftsFile in ftsJob:
          ftsFile.Status = "Waiting"
          ftsFilesDict["toSubmit"].append( ftsFile )
        # #  No way further for that job: delete it
        res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
        if not res['OK']:
          log.error( "Unable to delete FTSJob", res['Message'] )
        return S_OK( ftsFilesDict )
      return monitor

    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s%%" % ( ftsJob.Status, ftsJob.Completeness ) )

    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )

    if ftsJob.Status in FTSJob.FINALSTATES:
      finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
      if not finalizeFTSJob["OK"]:
        if 'Unknown transfer state' in finalizeFTSJob['Message']:
          for ftsFile in ftsJob:
            ftsFile.Status = "Waiting"
            ftsFilesDict["toSubmit"].append( ftsFile )
          # #  No way further for that job: delete it
          res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
          if not res['OK']:
            log.error( "Unable to delete FTSJob", res['Message'] )
        else:
          log.error( finalizeFTSJob["Message"] )
          return finalizeFTSJob
      else:
        ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )

    return S_OK( ftsFilesDict )
Example #16
    def export_requestJob(self, resourceDescription):
        """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """

        result = self.selectJob(resourceDescription)
        gMonitor.addMark("matchesDone")
        if result['OK']:
            gMonitor.addMark("matchesOK")
        return result
Example #17
  def export_requestJob( self, resourceDescription ):
    """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """

    result = self.selectJob( resourceDescription )
    gMonitor.addMark( "matchesDone" )
    if result[ 'OK' ]:
      gMonitor.addMark( "matchesOK" )
    return result
Example #18
  def execute( self ):
    """ one cycle execution """

    now = datetime.datetime.now()
    kickTime = now - datetime.timedelta( hours = self.KICK_ASSIGNED_HOURS )
    rmTime = now - datetime.timedelta( days = self.DEL_GRACE_DAYS )

    kicked = 0
    deleted = 0

    # # select Assigned FTSJobs
    assignedFTSJobList = self.ftsClient().getFTSJobList( ["Assigned"], self.KICK_LIMIT )
    if not assignedFTSJobList["OK"]:
      self.log.error( "execute: %s" % assignedFTSJobList["Message"] )
      return assignedFTSJobList
    assignedFTSJobList = assignedFTSJobList["Value"]

    for ftsJob in assignedFTSJobList:
      if ftsJob.LastUpdate < kickTime:
        self.log.debug( "FTSJob %s is Assigned for too long and has to be kicked" % ftsJob.FTSGUID )
        kicked += 1
        ftsJob.Status = "Submitted"
      put = self.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        self.log.error( "execute: unable to put back FTSJob %s: %s" % ( ftsJob.FTSGUID, put["Message"] ) )
        return put

    finishedFTSJobList = self.ftsClient().getFTSJobList( list( FTSJob.FINALSTATES ), self.DEL_LIMIT )
    if not finishedFTSJobList["OK"]:
      self.log.error( "execute: %s" % finishedFTSJobList["Message"] )
      return finishedFTSJobList
    finishedFTSJobList = finishedFTSJobList["Value"]

    for ftsJob in finishedFTSJobList:
      if ftsJob.LastUpdate < rmTime:
        self.log.debug( "FTSJob %s is too old and has to be deleted" % ftsJob.FTSGUID )
        delJob = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
        if not delJob["OK"]:
          self.log.error( "execute: %s" % delJob["Message"] )
          return delJob
        deleted += 1
      else:
        putJob = self.ftsClient().putFTSJob( ftsJob )
        if not putJob["OK"]:
          self.log.error( "execute: %s" % putJob["Message"] )
          return putJob

    self.log.info( "Assigned FTSJobs kicked %s Finished FTSJobs deleted %s" % ( kicked, deleted ) )
    gMonitor.addMark( "KickedFTSJobs", kicked )
    gMonitor.addMark( "DeletedFTSJobs", deleted )
    return S_OK()
Example #19
  def registerCopiedFiles(self, filesNewlyCopied, copiedFiles, allUnmigratedFilesMeta):
    """
      Register successfully copied files (newly copied, or already in Copied status in the DB) in the DFC.

      :param filesNewlyCopied: [lfns] of files newly copied
      :param copiedFiles: {lfn: RI DB metadata} of files that were in Copied state.
      :param allUnmigratedFilesMeta: {lfn: RI DB metadata} for all lfns not migrated at
                                     the beginning of the loop.

      :return: {lfn: True} for successfully registered lfns
    """
    if filesNewlyCopied or copiedFiles:
      self.log.info("Attempting to register %s newly copied and %s previously copied files" %
                    (len(filesNewlyCopied), len(copiedFiles)))
    else:
      self.log.info("No files to be registered")

    # Update copiedFiles to also contain the newly copied files
    copiedFiles.update(dict((lfn, allUnmigratedFilesMeta[lfn]) for lfn in filesNewlyCopied))

    successfulRegister = {}
    failedRegister = {}

    # Try to register them by batch
    for lfnChunk in breakListIntoChunks(copiedFiles, 100):
      # Add the metadata
      lfnDictChunk = dict((lfn, copiedFiles[lfn]) for lfn in lfnChunk)
      res = self.fileCatalog.addFile(lfnDictChunk)

      if not res['OK']:
        self.log.error("Completely failed to register some successfully copied files.",
                       res['Message'])
        failedRegister.update(dict((lfn, res['Message']) for lfn in lfnDictChunk))
      else:
        successfulRegister.update(res['Value']['Successful'])
        failedRegister.update(res['Value']['Failed'])

    gMonitor.addMark("ErrorRegister", len(failedRegister))
    for lfn, reason in failedRegister.iteritems():
      self.log.error("Failed to register lfn. Setting to Copied", "%s: %s" % (lfn, reason))
      res = self.rawIntegrityDB.setFileStatus(lfn, 'Copied')
      if not res['OK']:
        self.log.error("Error setting file status to Copied", "%s: %s" % (lfn, res['Message']))

    for lfn in successfulRegister:
      self.log.info("Successfully registered %s in the File Catalog." % lfn)

    return successfulRegister
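breakListIntoChunks, from DIRAC.Core.Utilities.List, slices an iterable into fixed-size batches so the catalog is never hit with one oversized bulk call. A quick usage sketch (LFN names are illustrative):

from DIRAC.Core.Utilities.List import breakListIntoChunks

lfns = ['/lhcb/data/file_%03d' % i for i in range(250)]
for chunk in breakListIntoChunks(lfns, 100):
    print(len(chunk))  # 100, 100, 50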
Example #20
    def export_putRequest(self, requestJSON):
        """forward request from local RequestDB to central RequestManager

        :param self: self reference
        :param str requestJSON: request serialized to JSON
        """

        gMonitor.addMark("reqReceived", 1)

        requestDict = json.loads(requestJSON)
        requestName = requestDict.get(
            "RequestID", requestDict.get("RequestName", "***UNKNOWN***"))
        gLogger.info("putRequest: got request '%s'" % requestName)

        # We only need the object to check the authorization
        request = Request(requestDict)
        # Check whether the credentials in the Requests are correct and allowed to be set
        isAuthorized = RequestValidator.setAndCheckRequestOwner(
            request, self.getRemoteCredentials())

        if not isAuthorized:
            return S_ERROR(DErrno.ENOAUTH,
                           "Credentials in the requests are not allowed")

        forwardable = self.__forwardable(requestDict)
        if not forwardable["OK"]:
            gLogger.warn("putRequest: %s" % forwardable["Message"])

        setRequest = self.requestManager().putRequest(requestJSON)
        if not setRequest["OK"]:
            gLogger.error(
                "setReqeuest: unable to set request '%s' @ RequestManager: %s"
                % (requestName, setRequest["Message"]))
            # # put request to the request file cache
            save = self.__saveRequest(requestName, requestJSON)
            if not save["OK"]:
                gLogger.error(
                    "setRequest: unable to save request to the cache: %s" %
                    save["Message"])
                return save
            gLogger.info("setRequest: %s is saved to %s file" %
                         (requestName, save["Value"]))
            return S_OK({"set": False, "saved": True})

        gLogger.info(
            "setRequest: request '%s' has been set to the ReqManager" %
            (requestName))
        return S_OK({"set": True, "saved": False})
Example #21
 def __getGraph( self, plotFunc, args ):
   fromSecs = args[0]
   toSecs = args[1]
   graphFile = "%s-%s-%s.png" % ( self.__generateName( *args[2:] ),
                                  self.rrdManager.bucketize( fromSecs, self.graceTime ),
                                  self.rrdManager.bucketize( toSecs, self.graceTime )
                               )
   if self.__isCacheGraph( graphFile ):
     self.__refreshGraph( graphFile )
     gLogger.info( "Cached graph file %s" % graphFile )
     gMonitor.addMark( "cachedplots" )
     return S_OK( graphFile )
   else:
     gMonitor.addMark( "drawnplots" )
     self.__registerGraph( graphFile, fromSecs, toSecs )
     return plotFunc( graphFilename = graphFile, *args )
Example #22
 def __getGraph(self, plotFunc, args):
     fromSecs = args[0]
     toSecs = args[1]
     graphFile = "%s-%s-%s.png" % (self.__generateName(
         *args[2:]), self.rrdManager.bucketize(fromSecs, self.graceTime),
                                   self.rrdManager.bucketize(
                                       toSecs, self.graceTime))
     if self.__isCacheGraph(graphFile):
         self.__refreshGraph(graphFile)
         gLogger.info("Cached graph file %s" % graphFile)
         gMonitor.addMark("cachedplots")
         return S_OK(graphFile)
     else:
         gMonitor.addMark("drawnplots")
         self.__registerGraph(graphFile, fromSecs, toSecs)
         return plotFunc(graphFilename=graphFile, *args)
Example #23
 def _startReportToMonitoring( self ):
   try:
     now = time.time()
     stats = os.times()
     cpuTime = stats[0] + stats[2]
     if now - self.__monitorLastStatsUpdate < 10:
       return ( now, cpuTime )
     # Send CPU consumption mark
     self.__monitorLastStatsUpdate = now
     # Send Memory consumption mark
     membytes = MemStat.VmB( 'VmRSS:' )
     if membytes:
       mem = membytes / ( 1024. * 1024. )
       gMonitor.addMark( 'MEM', mem )
     return ( now, cpuTime )
   except Exception:
     return False
Example #24
 def _startReportToMonitoring(self):
     try:
         now = time.time()
         stats = os.times()
         cpuTime = stats[0] + stats[2]
         if now - self.__monitorLastStatsUpdate < 10:
             return (now, cpuTime)
         # Send CPU consumption mark
         self.__monitorLastStatsUpdate = now
         # Send Memory consumption mark
         membytes = MemStat.VmB('VmRSS:')
         if membytes:
             mem = membytes / (1024. * 1024.)
             gMonitor.addMark('MEM', mem)
         return (now, cpuTime)
     except Exception:
         return False
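MemStat.VmB reads a single field from the process status file, so the memory mark is Linux-specific. A hedged sketch of what such a helper does; the real implementation lives in DIRAC.Core.Utilities.MemStat and this reimplementation is only illustrative:

def vmB(field='VmRSS:'):
    """Return the given /proc/self/status field in bytes, or 0.0 on failure."""
    try:
        with open('/proc/self/status') as status:
            for line in status:
                if line.startswith(field):
                    return float(line.split()[1]) * 1024.0  # the file reports kB
    except IOError:
        pass
    return 0.0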
Example #25
 def export_getReport( self, reportRequest ):
   """
   Plot an accounting report
   Arguments:
     - viewName : Name of view (easy!)
     - startTime
     - endTime
     - argsDict : Arguments to the view.
     - grouping
     - extraArgs
   """
   retVal = self.__checkPlotRequest( reportRequest )
   if not retVal[ 'OK' ]:
     return retVal
   reporter = MainReporter( self.__acDB, self.serviceInfoDict[ 'clientSetup' ] )
   gMonitor.addMark( "reportsRequested" )
   reportRequest[ 'generatePlot' ] = False
   return reporter.generate( reportRequest, self.getRemoteCredentials() )
Example #26
  def dmRemoval( self, toRemoveDict, targetSEs ):

    gMonitor.addMark( "RemoveReplicaAtt", len( toRemoveDict ) * len( targetSEs ) )
    # # keep status for each targetSE
    removalStatus = dict.fromkeys( toRemoveDict.keys(), None )
    for lfn in removalStatus:
      removalStatus[lfn] = dict.fromkeys( targetSEs, None )

    # # loop over targetSEs
    for targetSE in targetSEs:
      self.log.info( "removing replicas at %s" % targetSE )

      # # 1st step - bulk removal
      bulkRemoval = self.bulkRemoval( toRemoveDict, targetSE )
      if not bulkRemoval["OK"]:
        self.log.error( 'Bulk replica removal failed', bulkRemoval["Message"] )
        return bulkRemoval
      bulkRemoval = bulkRemoval["Value"]

      # # update removal status for successful files
      removalOK = [ opFile for opFile in bulkRemoval.values() if not opFile.Error ]

      for opFile in removalOK:
        removalStatus[opFile.LFN][targetSE] = ""
      gMonitor.addMark( "RemoveReplicaOK", len( removalOK ) )

      # # 2nd step - process the rest again
      toRetry = dict( [ ( lfn, opFile ) for lfn, opFile in bulkRemoval.items() if opFile.Error ] )
      for lfn, opFile in toRetry.items():
        self.singleRemoval( opFile, targetSE )
        if not opFile.Error:
          gMonitor.addMark( "RemoveReplicaOK", 1 )
          removalStatus[lfn][targetSE] = ""
        else:
          gMonitor.addMark( "RemoveReplicaFail", 1 )
          removalStatus[lfn][targetSE] = opFile.Error

    # # update file status for waiting files
    failed = 0
    for opFile in self.operation:
      if opFile.Status == "Waiting":
        errors = list( set( [ error for error in removalStatus[opFile.LFN].values() if error ] ) )
        if errors:
          opFile.Error = ",".join( errors )
          # This seems to be the only offending error
          if "Write access not permitted for this credential" in opFile.Error:
            failed += 1
            continue
        opFile.Status = "Done"

    if failed:
      self.operation.Error = "failed to remove %s replicas" % failed

    return S_OK(removalStatus)
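removalStatus is a nested mapping {lfn: {targetSE: error-or-empty-string}}; a file only counts as fully removed when every SE entry is empty. Its shape, with illustrative names:

removalStatus = {
    '/lhcb/data/file1': {'CERN-USER': '', 'IN2P3-USER': ''},         # removed everywhere
    '/lhcb/data/file2': {'CERN-USER': '', 'IN2P3-USER': 'timeout'},  # error string kept per SE
}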
Example #27
    def __checkReplicas(self):
        """ check done replicas and update file states  """
        waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation
                             if opFile.Status in ("Waiting", "Scheduled")])
        targetSESet = set(self.operation.targetSEList)

        # Check replicas
        res = self.ci._getCatalogReplicas(list(waitingFiles))

        if not res["OK"]:
            self.log.error('Failed to get catalog replicas', res["Message"])
            return S_ERROR()

        allReplicas = res['Value'][0]

        replicas = self.ci.compareChecksum(list(waitingFiles))

        if not replicas["OK"]:
            self.log.error('Failed to check replicas', replicas["Message"])
            return S_ERROR()

        replicas = replicas["Value"]
        noReplicas = replicas['NoReplicas']

        if noReplicas:
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Failed", len(noReplicas)))
                self.rmsMonitoringReporter.commit()
            for lfn in noReplicas.keys():
                self.log.error("File %s doesn't exist" % lfn)
                if not self.rmsMonitoring:
                    gMonitor.addMark("ReplicateFail", len(targetSESet))
                waitingFiles[lfn].Status = "Failed"

        for lfn, reps in allReplicas.items():
            if targetSESet.issubset(set(reps)):
                self.log.info("file %s has been replicated to all targets" %
                              lfn)
                waitingFiles[lfn].Status = "Done"

        return S_OK()
Example #28
 def _startReportToMonitoring(self):
     try:
         if not self.activityMonitoring:
             now = time.time()
             stats = os.times()
             cpuTime = stats[0] + stats[2]
             if now - self.__monitorLastStatsUpdate < 10:
                 return (now, cpuTime)
             # Send CPU consumption mark
             self.__monitorLastStatsUpdate = now
             # Send Memory consumption mark
             membytes = MemStat.VmB("VmRSS:")
             if membytes:
                 mem = membytes / (1024.0 * 1024.0)
                 gMonitor.addMark("MEM", mem)
             return (now, cpuTime)
         else:
             return False
     except Exception:
         return False
Example #29
  def export_putRequest(self, requestJSON):
    """ forward request from local RequestDB to central RequestManager

    :param self: self reference
    :param str requestJSON: request serialized to JSON
    """

    gMonitor.addMark('reqReceived', 1)

    requestDict = json.loads(requestJSON)
    requestName = requestDict.get("RequestID", requestDict.get('RequestName', "***UNKNOWN***"))
    gLogger.info("putRequest: got request '%s'" % requestName)

    # We only need the object to check the authorization
    request = Request(requestDict)
    # Check whether the credentials in the Requests are correct and allowed to be set
    isAuthorized = RequestValidator.setAndCheckRequestOwner(request, self.getRemoteCredentials())

    if not isAuthorized:
      return S_ERROR(DErrno.ENOAUTH, "Credentials in the requests are not allowed")

    forwardable = self.__forwardable(requestDict)
    if not forwardable["OK"]:
      gLogger.warn("putRequest: %s" % forwardable["Message"])

    setRequest = self.requestManager().putRequest(requestJSON)
    if not setRequest["OK"]:
      gLogger.error(
          "setReqeuest: unable to set request '%s' @ RequestManager: %s" %
          (requestName, setRequest["Message"]))
      # # put request to the request file cache
      save = self.__saveRequest(requestName, requestJSON)
      if not save["OK"]:
        gLogger.error("setRequest: unable to save request to the cache: %s" % save["Message"])
        return save
      gLogger.info("setRequest: %s is saved to %s file" % (requestName, save["Value"]))
      return S_OK({"set": False, "saved": True})

    gLogger.info("setRequest: request '%s' has been set to the ReqManager" % (requestName))
    return S_OK({"set": True, "saved": False})
Example #30
    def export_putRequest(self, requestJSON):
        """ forward request from local RequestDB to central RequestManager

    :param self: self reference
    :param str requestType: request type
    """

        gMonitor.addMark('reqReceived', 1)

        requestDict = json.loads(requestJSON)
        requestName = requestDict.get(
            "RequestID", requestDict.get('RequestName', "***UNKNOWN***"))
        gLogger.info("putRequest: got request '%s'" % requestName)

        forwardable = self.__forwardable(requestDict)
        if not forwardable["OK"]:
            gLogger.warn("putRequest: %s" % forwardable["Message"])

        setRequest = self.requestManager().putRequest(requestJSON)
        if not setRequest["OK"]:
            gLogger.error(
                "setReqeuest: unable to set request '%s' @ RequestManager: %s"
                % (requestName, setRequest["Message"]))
            # # put request to the request file cache
            save = self.__saveRequest(requestName, requestJSON)
            if not save["OK"]:
                gLogger.error(
                    "setRequest: unable to save request to the cache: %s" %
                    save["Message"])
                return save
            gLogger.info("setRequest: %s is saved to %s file" %
                         (requestName, save["Value"]))
            return S_OK({"set": False, "saved": True})

        gLogger.info(
            "setRequest: request '%s' has been set to the ReqManager" %
            (requestName))
        return S_OK({"set": True, "saved": False})
Example #31
    def __call__(self):
        """ call me maybe """
        # # counter for failed files
        failedFiles = 0
        # # catalog to use
        catalog = self.operation.Catalog
        dm = DataManager(catalogs=catalog)
        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        # # loop over files
        for opFile in waitingFiles:

            gMonitor.addMark("RegisterAtt", 1)

            # # get LFN
            lfn = opFile.LFN
            # # and others
            fileTuple = (lfn, opFile.PFN, opFile.Size,
                         self.operation.targetSEList[0], opFile.GUID,
                         opFile.Checksum)
            # # call DataManager
            registerFile = dm.registerFile(fileTuple)
            # # check results
            if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]:

                gMonitor.addMark("RegisterFail", 1)
                self.dataLoggingClient().addFileRecord(lfn, "RegisterFail",
                                                       catalog, "",
                                                       "RegisterFile")

                reason = registerFile.get(
                    "Message",
                    registerFile.get("Value", {}).get("Failed",
                                                      {}).get(lfn, 'Unknown'))
                errorStr = "failed to register LFN %s: %s" % (lfn, reason)
                opFile.Error = errorStr
                self.log.warn(errorStr)
                failedFiles += 1

            else:

                gMonitor.addMark("RegisterOK", 1)
                self.dataLoggingClient().addFileRecord(lfn, "Register",
                                                       catalog, "",
                                                       "RegisterFile")

                self.log.info("file %s has been registered at %s" %
                              (lfn, catalog))
                opFile.Status = "Done"

        # # final check
        if failedFiles:
            self.log.info("all files processed, %s files failed to register" %
                          failedFiles)
            self.operation.Error = "some files failed to register"
            return S_ERROR(self.operation.Error)

        return S_OK()
Example #32
  def removeRegisteredFiles(self, filesNewlyRegistered, registeredFiles, allUnmigratedFilesMeta):
    """
      Remove successfully registered files (newly registered, or already in Registered status in the DB)
      from the OnlineStorage

      :param filesNewlyRegistered: [lfns] of files newly registered
      :param registeredFiles: {lfn: RI DB metadata} of files that were in Registered state.
      :param allUnmigratedFilesMeta: {lfn: RI DB metadata} for all lfns not migrated at
                                     the beginning of the loop.

      :return: {lfn: True} for successfully removed lfns
    """
    if filesNewlyRegistered or registeredFiles:
      self.log.info("Attempting to remove %s newly registered and %s previously registered files" %
                    (len(filesNewlyRegistered), len(registeredFiles)))
    else:
      self.log.info("No files to be removed")

    # Update registeredFiles to also contain the newly registered files
    registeredFiles.update(dict((lfn, allUnmigratedFilesMeta[lfn]) for lfn in filesNewlyRegistered))

    onlineSE = StorageElement('OnlineRunDB')

    # Try to remove them all
    res = onlineSE.removeFile(registeredFiles)

    filesNewlyRemoved = {}
    failedRemove = {}
    if not res['OK']:
      self.log.error("Completely failed to remove successfully registered files.", res['Message'])
      failedRemove = dict((lfn, res['Message']) for lfn in registeredFiles)
    else:
      filesNewlyRemoved = res['Value']['Successful']
      failedRemove = res['Value']['Failed']

    gMonitor.addMark("ErrorRemove", len(failedRemove))
    for lfn, reason in failedRemove.iteritems():
      self.log.error("Failed to remove lfn. Setting to Registered", "%s: %s" % (lfn, reason))
      res = self.rawIntegrityDB.setFileStatus(lfn, 'Registered')
      if not res['OK']:
        self.log.error("Error setting file status to Registered", "%s: %s" % (lfn, res['Message']))

    now = datetime.datetime.utcnow()
    for lfn in filesNewlyRemoved:
      self.log.info("Successfully removed %s from the Online storage. Setting it to Done" % lfn)
      res = self.rawIntegrityDB.setFileStatus(lfn, 'Done')
      if not res['OK']:
        self.log.error("Error setting file status to Done", "%s: %s" % (lfn, res['Message']))
      else:
        # SubmitTime is ALREADY a datetime since it is declared as such in the DB.
        submitTime = allUnmigratedFilesMeta[lfn]['SubmitTime']
        migrationTime = (now - submitTime).total_seconds()
        gMonitor.addMark("MigrationTime", migrationTime)
        fileSizeMB = allUnmigratedFilesMeta[lfn]['Size'] / (1024 * 1024.0)
        gMonitor.addMark("MigrationRate", fileSizeMB / migrationTime)

    return filesNewlyRemoved
Example #33
  def export_removeReplica(self, lfns):
    """ Remove the supplied replicas """
    gMonitor.addMark("RemoveReplica", 1)
    res = self.fileCatalogDB.removeReplica(lfns, self.getRemoteCredentials())
    if res['OK']:
      gMonitor.addMark("RemoveReplicaSuccessful", len(res.get('Value', {}).get('Successful', [])))
      gMonitor.addMark("RemoveReplicaFailed", len(res.get('Value', {}).get('Failed', [])))

    return res
Example #34
  def export_addFile( self, lfns ):
    """ Register supplied files """
    gMonitor.addMark( "AddFile", 1 )
    res = gFileCatalogDB.addFile( lfns, self.getRemoteCredentials() )
    if res['OK']:
      gMonitor.addMark( "AddFileSuccessful", len( res.get( 'Value', {} ).get( 'Successful', [] ) ) )
      gMonitor.addMark( "AddFileFailed", len( res.get( 'Value', {} ).get( 'Failed', [] ) ) )

    return res
Example #35
  def export_addFile(self, lfns):
    """ Register supplied files """
    gMonitor.addMark("AddFile", 1)
    res = self.fileCatalogDB.addFile(lfns, self.getRemoteCredentials())
    if res['OK']:
      gMonitor.addMark("AddFileSuccessful", len(res.get('Value', {}).get('Successful', [])))
      gMonitor.addMark("AddFileFailed", len(res.get('Value', {}).get('Failed', [])))

    return res
Example #36
  def export_removeReplica( self, lfns ):
    """ Remove the supplied replicas """
    gMonitor.addMark( "RemoveReplica", 1 )
    res = gFileCatalogDB.removeReplica( lfns, self.getRemoteCredentials() )
    if res['OK']:
      gMonitor.addMark( "RemoveReplicaSuccessful", len( res.get( 'Value', {} ).get( 'Successful', [] ) ) )
      gMonitor.addMark( "RemoveReplicaFailed", len( res.get( 'Value', {} ).get( 'Failed', [] ) ) )

    return res
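The catalog handlers above all exploit the same bulk-result shape: on success, res['Value'] holds 'Successful' and 'Failed' dicts keyed by LFN, so the marks simply count the entries of each. Illustrative shape:

res = {'OK': True,
       'Value': {'Successful': {'/lhcb/data/file1': True},
                 'Failed': {'/lhcb/data/file2': 'No such file'}}}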
Example #37
  def __call__( self ):
    """ call me maybe """
    # # counter for failed files
    failedFiles = 0
    # # catalog(s) to use
    catalogs = self.operation.Catalog
    if catalogs:
      catalogs = [ cat.strip() for cat in catalogs.split( ',' ) ]
    dm = DataManager( catalogs = catalogs )
    # # get waiting files
    waitingFiles = self.getWaitingFilesList()
    # # loop over files
    for opFile in waitingFiles:

      gMonitor.addMark( "RegisterAtt", 1 )

      # # get LFN
      lfn = opFile.LFN
      # # and others
      fileTuple = ( lfn , opFile.PFN, opFile.Size, self.operation.targetSEList[0], opFile.GUID, opFile.Checksum )
      # # call DataManager
      registerFile = dm.registerFile( fileTuple )
      # # check results
      if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]:

        gMonitor.addMark( "RegisterFail", 1 )
#        self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", ','.join( catalogs ) if catalogs else "all catalogs", "", "RegisterFile" )

        reason = str( registerFile.get( "Message", registerFile.get( "Value", {} ).get( "Failed", {} ).get( lfn, 'Unknown' ) ) )
        errorStr = "failed to register LFN"
        opFile.Error = "%s: %s" % ( errorStr, reason )
        if 'GUID already registered' in reason:
          opFile.Status = 'Failed'
          self.log.error( errorStr, "%s: %s" % ( lfn, reason ) )
        elif 'File already registered with no replicas' in reason:
          self.log.warn( errorStr, "%s: %s, will remove it and retry" % ( lfn, reason ) )
          dm.removeFile( lfn )
        else:
          self.log.warn( errorStr, "%s: %s" % ( lfn, reason ) )
        failedFiles += 1

      else:

        gMonitor.addMark( "RegisterOK", 1 )
#        self.dataLoggingClient().addFileRecord( lfn, "Register", ','.join( catalogs ) if catalogs else "all catalogs", "", "RegisterFile" )

        self.log.verbose( "file %s has been registered at %s" % ( lfn, ','.join( catalogs ) if catalogs else "all catalogs" ) )
        opFile.Status = "Done"

    # # final check
    if failedFiles:
      self.log.warn( "all files processed, %s files failed to register" % failedFiles )
      self.operation.Error = "some files failed to register"
      return S_ERROR( self.operation.Error )

    return S_OK()
Example #38
  def __checkReplicas( self ):
    """ check done replicas and update file states  """
    waitingFiles = dict( [ ( opFile.LFN, opFile ) for opFile in self.operation
                          if opFile.Status in ( "Waiting", "Scheduled" ) ] )
    targetSESet = set( self.operation.targetSEList )

    # Check replicas
    res = self.ci._getCatalogReplicas( waitingFiles.keys() )

    if not res["OK"]:
      self.log.error( 'Failed to get catalog replicas', res["Message"] )
      return S_ERROR()

    allReplicas = res['Value'][0]

    replicas = self.ci.compareChecksum( waitingFiles.keys() )

    if not replicas["OK"]:
      self.log.error( 'Failed to check replicas', replicas["Message"] )
      return S_ERROR()

    replicas = replicas["Value"]
    noReplicas = replicas['NoReplicas']

    if noReplicas:
      for lfn in noReplicas.keys():
        self.log.error( "File %s doesn't exist" % lfn )
        gMonitor.addMark( "ReplicateFail", len( targetSESet ) )
        waitingFiles[lfn].Status = "Failed"

    for lfn, reps in allReplicas.items():
      if targetSESet.issubset( set( reps ) ):
        self.log.info( "file %s has been replicated to all targets" % lfn )
        waitingFiles[lfn].Status = "Done"

    return S_OK()
Example #39
 def __call__(self):
   """Process the ArchiveFiles operation."""
   try:
     gMonitor.addMark('ArchiveFilesAtt', 1)
     self._run()
     gMonitor.addMark('ArchiveFilesOK', 1)
   except RuntimeError as e:
     self.log.info('Failed to execute ArchiveFiles', repr(e))
     gMonitor.addMark('ArchiveFilesFail', 1)
     return S_ERROR(str(e))
   except Exception as e:
     self.log.exception('Failed to execute ArchiveFiles', repr(e), lException=e)
     gMonitor.addMark('ArchiveFilesFail', 1)
     return S_ERROR(str(e))
   finally:
     self._cleanup()
   return S_OK()
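Note the control flow: the finally block guarantees _cleanup() runs on every path, the except branches return S_ERROR after marking the failure, and the trailing return S_OK() is reached only when _run() completed without raising.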
Example #40
    def export_removeFile(self, lfns):
        """Remove the supplied lfns"""
        gMonitor.addMark("RemoveFile", 1)
        res = self.fileCatalogDB.removeFile(lfns, self.getRemoteCredentials())
        if res["OK"]:
            gMonitor.addMark("RemoveFileSuccessful",
                             len(res.get("Value", {}).get("Successful", [])))
            gMonitor.addMark("RemoveFileFailed",
                             len(res.get("Value", {}).get("Failed", [])))

        return res
Example #45
    def export_addReplica(self, lfns):
        """Register supplied replicas"""
        gMonitor.addMark("AddReplica", 1)
        res = self.fileCatalogDB.addReplica(lfns, self.getRemoteCredentials())
        if res["OK"]:
            gMonitor.addMark("AddReplicaSuccessful",
                             len(res.get("Value", {}).get("Successful", [])))
            gMonitor.addMark("AddReplicaFailed",
                             len(res.get("Value", {}).get("Failed", [])))

        return res
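Both handlers rely on the DIRAC bulk-result convention: an S_OK structure whose 'Value' carries 'Successful' and 'Failed' dictionaries keyed by LFN. A minimal sketch of consuming such a result, with hypothetical LFNs:

# hypothetical bulk result in the shape the handlers above assume
res = {'OK': True,
       'Value': {'Successful': {'/vo/data/a.txt': True},
                 'Failed': {'/vo/data/b.txt': 'No such file'}}}
if res['OK']:
  gMonitor.addMark('RemoveFileSuccessful', len(res['Value']['Successful']))
  gMonitor.addMark('RemoveFileFailed', len(res['Value']['Failed']))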
Example #46
    def sweeper(cls):
        """move cached request to the central request manager

        :param self: self reference
        """
        cacheDir = cls.cacheDir()

        # # cache dir empty?
        if not os.listdir(cacheDir):
            gLogger.always("sweeper: CacheDir %s is empty, nothing to do" %
                           cacheDir)
            return S_OK()
        else:
            # # read <sweepSize> cache dir files, the oldest first
            cachedRequests = [
                os.path.abspath(requestFile) for requestFile in sorted(
                    filter(os.path.isfile, [
                        os.path.join(cacheDir, requestName)
                        for requestName in os.listdir(cacheDir)
                    ]),
                    key=os.path.getctime,
                )
            ][:cls.sweepSize]
            # # set cached requests to the central RequestManager
            for cachedFile in cachedRequests:
                # # guard against a malformed request left over from a previous run
                try:
                    requestJSON = "".join(open(cachedFile, "r").readlines())
                    cachedRequest = json.loads(requestJSON)
                    cachedName = cachedRequest.get("RequestName",
                                                   "***UNKNOWN***")
                    putRequest = cls.requestManager().putRequest(requestJSON)
                    if not putRequest["OK"]:
                        gLogger.error(
                            "sweeper: unable to set request %s @ ReqManager: %s"
                            % (cachedName, putRequest["Message"]))
                        gMonitor.addMark("reqFailed", 1)

                        continue
                    gLogger.info(
                        "sweeper: successfully put request '%s' @ ReqManager" %
                        cachedName)
                    gMonitor.addMark("reqSwept", 1)
                    os.unlink(cachedFile)
                except Exception as error:
                    gMonitor.addMark("reqFailed", 1)
                    gLogger.exception("sweeper: hit by exception",
                                      lException=error)

            return S_OK()
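The sweeper takes the oldest cached request files first by sorting on os.path.getctime and slicing to sweepSize. That selection can be sketched on its own, assuming only a directory path:

import os

def oldestFiles(cacheDir, limit=10):
  """Return up to `limit` absolute paths of plain files in cacheDir, oldest first."""
  paths = [os.path.join(cacheDir, name) for name in os.listdir(cacheDir)]
  files = [os.path.abspath(p) for p in paths if os.path.isfile(p)]
  return sorted(files, key=os.path.getctime)[:limit]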
Example #47
  def __call__( self ):
    """ call me maybe """
    # # counter for failed files
    failedFiles = 0
    # # catalog to use
    catalog = self.operation.Catalog
    dm = DataManager( catalogs = catalog )
    # # get waiting files
    waitingFiles = self.getWaitingFilesList()
    # # loop over files
    for opFile in waitingFiles:

      gMonitor.addMark( "RegisterAtt", 1 )

      # # get LFN
      lfn = opFile.LFN
      # # and others
      fileTuple = ( lfn , opFile.PFN, opFile.Size, self.operation.targetSEList[0], opFile.GUID, opFile.Checksum )
      # # call DataManager
      registerFile = dm.registerFile( fileTuple )
      # # check results
      if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]:

        gMonitor.addMark( "RegisterFail", 1 )
        self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", catalog, "", "RegisterFile" )

        reason = registerFile.get( "Message", registerFile.get( "Value", {} ).get( "Failed", {} ).get( lfn, 'Unknown' ) )
        errorStr = "failed to register LFN %s: %s" % ( lfn, reason )
        opFile.Error = errorStr
        self.log.warn( errorStr )
        failedFiles += 1

      else:

        gMonitor.addMark( "RegisterOK", 1 )
        self.dataLoggingClient().addFileRecord( lfn, "Register", catalog, "", "RegisterFile" )

        self.log.info( "file %s has been registered at %s" % ( lfn, catalog ) )
        opFile.Status = "Done"

    # # final check
    if failedFiles:
      self.log.info( "all files processed, %s files failed to register" % failedFiles )
      self.operation.Error = "some files failed to register"
      return S_ERROR( self.operation.Error )

    return S_OK()
Example #48
  def sweeper( cls ):
    """ move cached request to the central request manager

    :param self: self reference
    """
    cacheDir = cls.cacheDir()


    # # cache dir empty?
    if not os.listdir( cacheDir ):
      gLogger.always( "sweeper: CacheDir %s is empty, nothing to do" % cacheDir )
      return S_OK()
    else:
      # # read 10 cache dir files, the oldest first
      cachedRequests = [ os.path.abspath( requestFile ) for requestFile in
                         sorted( filter( os.path.isfile,
                                         [ os.path.join( cacheDir, requestName )
                                           for requestName in os.listdir( cacheDir ) ] ),
                                 key = os.path.getctime ) ][:10]
      # # set cached requests to the central RequestManager
      for cachedFile in cachedRequests:
          # # guard against a malformed request left over from a previous run
        try:
          requestJSON = "".join( open( cachedFile, "r" ).readlines() )
          cachedRequest = json.loads( requestJSON )
          cachedName = cachedRequest.get( "RequestName", "***UNKNOWN***" )
          putRequest = cls.requestManager().putRequest( requestJSON )
          if not putRequest["OK"]:
            gLogger.error( "sweeper: unable to set request %s @ ReqManager: %s" % ( cachedName,
                                                                                    putRequest["Message"] ) )
            gMonitor.addMark( "reqFailed", 1 )

            continue
          gLogger.info( "sweeper: successfully put request '%s' @ ReqManager" % cachedName )
          gMonitor.addMark( "reqSwept", 1 )
          os.unlink( cachedFile )
        except Exception as error:
          gMonitor.addMark( "reqFailed", 1 )
          gLogger.exception( "sweeper: hit by exception", lException = error )


      return S_OK()
Example #49
  def ftsTransfer(self):
    """ replicate and register using FTS """

    self.log.info("scheduling files in FTS...")

    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
      gMonitor.addMark("FTSScheduleAtt")
      gMonitor.addMark("FTSScheduleFail")
      return bannedTargets

    if bannedTargets['Value']:
      return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value']))

    # Can continue now
    self.log.verbose("No targets banned for writing")

    toSchedule = {}

    delayExecution = 0
    errors = defaultdict(int)
    for opFile in self.getWaitingFilesList():
      opFile.Error = ''
      gMonitor.addMark("FTSScheduleAtt")
      # # check replicas
      replicas = self._filterReplicas(opFile)
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]

      validReplicas = replicas.get("Valid")
      noMetaReplicas = replicas.get("NoMetadata")
      noReplicas = replicas.get('NoReplicas')
      badReplicas = replicas.get('Bad')
      noActiveReplicas = replicas.get('NoActiveReplicas')

      if validReplicas:
        validTargets = list(set(self.operation.targetSEList) - set(validReplicas))
        if not validTargets:
          self.log.info("file %s is already present at all targets" % opFile.LFN)
          opFile.Status = "Done"
        else:
          toSchedule[opFile.LFN] = [opFile, validReplicas, validTargets]
      else:
        gMonitor.addMark("FTSScheduleFail")
        if noMetaReplicas:
          err = "Couldn't get metadata"
          errors[err] += 1
          self.log.verbose(
              "unable to schedule '%s', %s at %s" %
              (opFile.LFN, err, ','.join(noMetaReplicas)))
          opFile.Error = err
        elif noReplicas:
          err = "File doesn't exist"
          errors[err] += 1
          self.log.error("Unable to schedule transfer",
                         "%s %s at %s" % (opFile.LFN, err, ','.join(noReplicas)))
          opFile.Error = err
          opFile.Status = 'Failed'
        elif badReplicas:
          err = "All replicas have a bad checksum"
          errors[err] += 1
          self.log.error("Unable to schedule transfer",
                         "%s, %s at %s" % (opFile.LFN, err, ','.join(badReplicas)))
          opFile.Error = err
          opFile.Status = 'Failed'
        elif noActiveReplicas:
          err = "No active replica found"
          errors[err] += 1
          self.log.verbose("Unable to schedule transfer",
                           "%s, %s at %s" % (opFile.LFN, err, ','.join(noActiveReplicas)))
          opFile.Error = err
          # All source SEs are banned, delay execution by 1 hour
          delayExecution = 60

    if delayExecution:
      self.log.info("Delay execution of the request by %d minutes" % delayExecution)
      self.request.delayNextExecution(delayExecution)
    # Log error counts
    for error, count in errors.items():
      self.log.error(error, 'for %d files' % count)

    filesToScheduleList = []
    res = self._addMetadataToFiles(toSchedule)
    if not res['OK']:
      return res
    else:
      filesToSchedule = res['Value']

      for lfn in filesToSchedule:
        filesToScheduleList.append((filesToSchedule[lfn][0].toJSON()['Value'],
                                    toSchedule[lfn][1],
                                    toSchedule[lfn][2]))

    if filesToScheduleList:

      ftsSchedule = FTSClient().ftsSchedule(self.request.RequestID,
                                            self.operation.OperationID,
                                            filesToScheduleList)
      if not ftsSchedule["OK"]:
        self.log.error("Completely failed to schedule to FTS:", ftsSchedule["Message"])
        return ftsSchedule

      # might have nothing to schedule
      ftsSchedule = ftsSchedule["Value"]
      if not ftsSchedule:
        return S_OK()

      self.log.info("%d files have been scheduled to FTS" % len(ftsSchedule['Successful']))
      for opFile in self.operation:
        fileID = opFile.FileID
        if fileID in ftsSchedule["Successful"]:
          gMonitor.addMark("FTSScheduleOK", 1)
          opFile.Status = "Scheduled"
          self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
        elif fileID in ftsSchedule["Failed"]:
          gMonitor.addMark("FTSScheduleFail", 1)
          opFile.Error = ftsSchedule["Failed"][fileID]
          if 'sourceSURL equals to targetSURL' in opFile.Error:
            # In this case there is no need to continue
            opFile.Status = 'Failed'
          self.log.warn("unable to schedule %s for FTS: %s" % (opFile.LFN, opFile.Error))
    else:
      self.log.info("No files to schedule after metadata checks")

    # Just in case some transfers could not be scheduled, try them with RM
    return self.dmTransfer(fromFTS=True)
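The error bookkeeping above uses collections.defaultdict(int), so a previously unseen error string starts at zero and can be incremented unconditionally. The same tally outside the DIRAC context:

from collections import defaultdict

errors = defaultdict(int)
for msg in ("File doesn't exist", "Couldn't get metadata", "File doesn't exist"):
  errors[msg] += 1
for msg, count in errors.items():
  print("%s for %d files" % (msg, count))  # "File doesn't exist for 2 files", ...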
Example #50
  def __call__( self ):
    """ perform physical removal operation """
    bannedTargets = self.checkSEsRSS( access = 'RemoveAccess' )
    if not bannedTargets['OK']:
      gMonitor.addMark( "PhysicalRemovalAtt" )
      gMonitor.addMark( "PhysicalRemovalFail" )
      return bannedTargets

    if bannedTargets['Value']:
      return S_OK( "%s targets are banned for removal" % ",".join( bannedTargets['Value'] ) )

    # # get waiting files
    waitingFiles = self.getWaitingFilesList()
    # # prepare lfn dict
    toRemoveDict = dict( ( opFile.LFN, opFile ) for opFile in waitingFiles )

    targetSEs = self.operation.targetSEList
    gMonitor.addMark( "PhysicalRemovalAtt", len( toRemoveDict ) * len( targetSEs ) )

    # # keep errors dict
    removalStatus = dict.fromkeys( toRemoveDict.keys(), None )
    for lfn in removalStatus:
      removalStatus[lfn] = dict.fromkeys( targetSEs, "" )

    for targetSE in targetSEs:

      self.log.info( "removing files from %s" % targetSE )

      # # 1st - bulk removal
      bulkRemoval = self.bulkRemoval( toRemoveDict, targetSE )
      if not bulkRemoval["OK"]:
        self.log.error( 'Failed bulk removal', bulkRemoval["Message"] )
        self.operation.Error = bulkRemoval["Message"]
        return bulkRemoval

      bulkRemoval = bulkRemoval["Value"]

      for lfn, opFile in toRemoveDict.items():
        removalStatus[lfn][targetSE] = bulkRemoval["Failed"].get( lfn, "" )
        opFile.Error = removalStatus[lfn][targetSE]

      # # 2nd - single file removal
      toRetry = dict( ( lfn, opFile ) for lfn, opFile in toRemoveDict.items() if lfn in bulkRemoval["Failed"] )
      for lfn, opFile in toRetry.items():
        self.singleRemoval( opFile, targetSE )
        if not opFile.Error:
          removalStatus[lfn][targetSE] = ""
        else:
          gMonitor.addMark( "PhysicalRemovalFail", 1 )
          removalStatus[lfn][targetSE] = opFile.Error

    # # update file status for waiting files
    failed = 0
    for opFile in self.operation:
      if opFile.Status == "Waiting":
        errors = [ error for error in removalStatus[opFile.LFN].values() if error.strip() ]
        if errors:
          failed += 1
          opFile.Error = ",".join( errors )
          if "Write access not permitted for this credential" in opFile.Error:
            opFile.Status = "Failed"
            gMonitor.addMark( "PhysicalRemovalFail", len( errors ) )
          continue
        gMonitor.addMark( "PhysicalRemovalOK", len( targetSEs ) )
        gMonitor.addMark( "PhysicalRemovalSize", opFile.Size * len( targetSEs ) )
        opFile.Status = "Done"

    if failed:
      self.operation.Error = "failed to remove %s files" % failed

    return S_OK()
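The per-file, per-SE error map above is built with dict.fromkeys twice: first with None as a placeholder, then with a fresh dict per LFN. The two-step is deliberate, since dict.fromkeys with a single mutable default would share that one dict across every key. A sketch of the safe initialization, with hypothetical names:

lfns = ['/vo/a', '/vo/b']    # hypothetical LFNs
targetSEs = ['SE1', 'SE2']   # hypothetical storage elements
removalStatus = dict((lfn, dict.fromkeys(targetSEs, "")) for lfn in lfns)
removalStatus['/vo/a']['SE1'] = 'Access denied'  # only this entry changes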
Example #51
  def __call__( self ):
    """ action for 'removeFile' operation  """
    # # get waiting files
    waitingFiles = self.getWaitingFilesList()
    fc = FileCatalog( self.operation.catalogList )

    res = fc.getReplicas( [wf.LFN for wf in waitingFiles] )
    if not res['OK']:
      gMonitor.addMark( "RemoveFileAtt" )
      gMonitor.addMark( "RemoveFileFail" )
      return res

    # We check the status of the SE from the LFN that are successful
    # No idea what to do with the others...
    replicas = res['Value']['Successful']
    targetSEs = set( [se for lfn in replicas for se in replicas[lfn] ] )

    bannedTargets = set()
    if targetSEs:
      bannedTargets = self.checkSEsRSS( targetSEs, access = 'RemoveAccess' )
      if not bannedTargets['OK']:
        gMonitor.addMark( "RemoveFileAtt" )
        gMonitor.addMark( "RemoveFileFail" )
        return bannedTargets
      bannedTargets = set( bannedTargets['Value'] )
      if bannedTargets and 'always banned' in self.operation.Error:
        return S_OK( "%s targets are always banned for removal" % ",".join( sorted( bannedTargets ) ) )

    # # prepare waiting file dict
    # # We take only files that have no replica at the banned SEs... If no replica, don't
    toRemoveDict = dict( ( opFile.LFN, opFile ) for opFile in waitingFiles if not bannedTargets.intersection( replicas.get( opFile.LFN, [] ) ) )

    if toRemoveDict:
      gMonitor.addMark( "RemoveFileAtt", len( toRemoveDict ) )
      # # 1st step - bulk removal
      self.log.debug( "bulk removal of %s files" % len( toRemoveDict ) )
      bulkRemoval = self.bulkRemoval( toRemoveDict )
      if not bulkRemoval["OK"]:
        self.log.error( "Bulk file removal failed", bulkRemoval["Message"] )
      else:
        gMonitor.addMark( "RemoveFileOK", len( toRemoveDict ) - len( bulkRemoval["Value"] ) )
        toRemoveDict = bulkRemoval["Value"]

      # # 2nd step - single file removal
      for lfn, opFile in toRemoveDict.items():
        self.log.info( "removing single file %s" % lfn )
        singleRemoval = self.singleRemoval( opFile )
        if not singleRemoval["OK"]:
          self.log.error( 'Error removing single file', singleRemoval["Message"] )
          gMonitor.addMark( "RemoveFileFail", 1 )
        else:
          self.log.info( "file %s has been removed" % lfn )
          gMonitor.addMark( "RemoveFileOK", 1 )

      # # set operation error for files still Failed or Waiting
      failedFiles = [ ( lfn, opFile ) for ( lfn, opFile ) in toRemoveDict.items()
                      if opFile.Status in ( "Failed", "Waiting" ) ]
      if failedFiles:
        self.operation.Error = "failed to remove %d files" % len( failedFiles )

    if bannedTargets:
      return S_OK( "%s targets are banned for removal" % ",".join( sorted( bannedTargets ) ) )
    return S_OK()
Example #52
def sendNumTaskQueues():
  result = gTaskQueueDB.getNumTaskQueues()
  if result[ 'OK' ]:
    gMonitor.addMark( 'numTQs', result[ 'Value' ] )
  else:
    gLogger.error( "Cannot get the number of task queues", result[ 'Message' ] )
Example #53
  def selectJob( self, resourceDescription ):
    """ Main job selection function to find the highest priority job
        matching the resource capacity
    """

    startTime = time.time()
    resourceDict = self.__processResourceDescription( resourceDescription )

    credDict = self.getRemoteCredentials()
    #Check credentials if not generic pilot
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      #You can only match groups in the same VO
      vo = Registry.getVOForGroup( credDict[ 'group' ] )
      result = Registry.getGroupsForVO( vo )
      if result[ 'OK' ]:
        resourceDict[ 'OwnerGroup' ] = result[ 'Value' ]
    else:
      #If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict[ 'properties' ]:
        gLogger.notice( "Setting the resource DN to the credentials DN" )
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      #If it's job sharing, the group has to be the same; just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict[ 'properties' ]:
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
        gLogger.notice( "Setting the resource group to the credentials group" )
        if 'OwnerDN'  in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
          ownerDN = resourceDict[ 'OwnerDN' ]
          result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
          if not result[ 'OK' ] or credDict[ 'group' ] not in result[ 'Value' ]:
            #DN is not in the same group
            gLogger.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
            resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      #Nothing special, group and DN have to be the same
      else:
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    # Check the pilot DIRAC version
    if self.__opsHelper.getValue( "Pilot/CheckVersion", True ):
      if 'ReleaseVersion' not in resourceDict:
        if 'DIRACVersion' not in resourceDict:
          return S_ERROR( 'Version check requested and not provided by Pilot' )
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.__opsHelper.getValue( "Pilot/Version", [] )
      if validVersions and pilotVersion not in validVersions:
        return S_ERROR( 'Pilot version does not match the production version: %s not in ( %s )' % \
                       ( pilotVersion, ",".join( validVersions ) ) )
      #Check project if requested
      validProject = self.__opsHelper.getValue( "Pilot/Project", "" )
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          return S_ERROR( "Version check requested but expected project %s not received" % validProject )
        if resourceDict[ 'ReleaseProject' ] != validProject:
          return S_ERROR( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                               resourceDict[ 'ReleaseProject' ] ) )

    # Update pilot information
    pilotInfoReported = False
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      if "PilotInfoReportedFlag" in resourceDict and not resourceDict['PilotInfoReportedFlag']:
        gridCE = resourceDict.get( 'GridCE', 'Unknown' )
        site = resourceDict.get( 'Site', 'Unknown' )
        benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
        gLogger.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % (pilotReference,gridCE,site,benchmark) )
        result = gPilotAgentsDB.setPilotStatus( pilotReference, status = 'Running',
                                                gridSite = site,
                                                destination = gridCE,
                                                benchmark = benchmark )
        if result['OK']:
          pilotInfoReported = True                                        
    
    #Check the site mask
    if 'Site' not in resourceDict:
      return S_ERROR( 'Missing Site Name in Resource JDL' )

    # Get common site mask and check the agent site
    result = gJobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      return S_ERROR( 'Internal error: can not get site mask' )
    maskList = result['Value']

    siteName = resourceDict['Site']
    if siteName not in maskList:
      
      # if 'GridCE' not in resourceDict:
      #  return S_ERROR( 'Site not in mask and GridCE not specified' )
      # Even if the site is banned, if it defines a CE, it must be able to check it
      # del resourceDict['Site']
      
      # Banned site can only take Test jobs 
      resourceDict['JobType'] = 'Test'

    resourceDict['Setup'] = self.serviceInfoDict['clientSetup']

    gLogger.verbose( "Resource description:" )
    for key in resourceDict:
      gLogger.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    negativeCond = self.__limiter.getNegativeCondForSite( siteName )
    result = gTaskQueueDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if DEBUG:
      print result

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      return S_ERROR( 'No match found' )

    jobID = result['jobId']
    resAtt = gJobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      return S_ERROR( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      return S_ERROR( 'No attributes returned for job' )
    if resAtt['Value']['Status'] != 'Waiting':
      gLogger.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      result = gTaskQueueDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        return result
      return S_ERROR( "Job %s is not in Waiting state" % str( jobID ) )

    attNames = ['Status','MinorStatus','ApplicationStatus','Site']
    attValues = ['Matched','Assigned','Unknown',siteName]
    result = gJobDB.setJobAttributes( jobID, attNames, attValues )
    # result = gJobDB.setJobStatus( jobID, status = 'Matched', minor = 'Assigned' )
    result = gJobLoggingDB.addLoggingRecord( jobID,
                                           status = 'Matched',
                                           minor = 'Assigned',
                                           source = 'Matcher' )

    result = gJobDB.getJobJDL( jobID )
    if not result['OK']:
      return S_ERROR( 'Failed to get the job JDL' )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    gLogger.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = gJobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = gJobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      return S_ERROR( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      return S_ERROR( 'No attributes returned for job' )

    if self.__opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.__limiter.updateDelayCounters( siteName, jobID )

    # Report pilot-job association
    if pilotReference:
      result = gPilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      result = gPilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus=False )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = pilotInfoReported
    return S_OK( resultDict )
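Note that the match is timed with a plain time.time() delta and reported through addMark, so 'matchTime' behaves as a gauge rather than a counter. The timing pattern in isolation:

import time

startTime = time.time()
# ... perform the match ...
matchTime = time.time() - startTime
gMonitor.addMark("matchTime", matchTime)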
Example #54
  def fts3Transfer(self):
    """ replicate and register using FTS3 """

    self.log.info("scheduling files in FTS3...")

    # First check whether there are already ongoing transfers

    res = self._checkExistingFTS3Operations()
    if not res['OK']:
      return res

    # if res['Value'] is False
    # it means that there are ongoing transfers
    # and we should stop here
    if res['Value'] is False:
      # return S_OK such that the request is put back
      return S_OK()

    fts3Files = []
    toSchedule = {}

    # Dict which maps the FileID to the object
    rmsFilesIds = {}

    for opFile in self.getWaitingFilesList():
      rmsFilesIds[opFile.FileID] = opFile

      opFile.Error = ''
      gMonitor.addMark("FTSScheduleAtt")
      # # check replicas
      replicas = self._filterReplicas(opFile)
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]

      validReplicas = replicas["Valid"]
      noMetaReplicas = replicas["NoMetadata"]
      noReplicas = replicas['NoReplicas']
      badReplicas = replicas['Bad']
      noPFN = replicas['NoPFN']

      if validReplicas:
        validTargets = list(set(self.operation.targetSEList) - set(validReplicas))
        if not validTargets:
          self.log.info("file %s is already present at all targets" % opFile.LFN)
          opFile.Status = "Done"
        else:
          toSchedule[opFile.LFN] = [opFile, validTargets]

      else:
        gMonitor.addMark("FTSScheduleFail")
        if noMetaReplicas:
          self.log.warn("unable to schedule '%s', couldn't get metadata at %s" % (opFile.LFN, ','.join(noMetaReplicas)))
          opFile.Error = "Couldn't get metadata"
        elif noReplicas:
          self.log.error(
              "Unable to schedule transfer", "File %s doesn't exist at %s" %
              (opFile.LFN, ','.join(noReplicas)))
          opFile.Error = 'No replicas found'
          opFile.Status = 'Failed'
        elif badReplicas:
          self.log.error(
              "Unable to schedule transfer",
              "File %s, all replicas have a bad checksum at %s" %
              (opFile.LFN,
               ','.join(badReplicas)))
          opFile.Error = 'All replicas have a bad checksum'
          opFile.Status = 'Failed'
        elif noPFN:
          self.log.warn(
              "unable to schedule %s, could not get a PFN at %s" %
              (opFile.LFN, ','.join(noPFN)))

    res = self._addMetadataToFiles(toSchedule)
    if not res['OK']:
      return res
    else:
      filesToSchedule = res['Value']

      for lfn in filesToSchedule:
        opFile = filesToSchedule[lfn]
        validTargets = toSchedule[lfn][1]
        for targetSE in validTargets:
          ftsFile = FTS3File.fromRMSFile(opFile, targetSE)
          fts3Files.append(ftsFile)

    if fts3Files:
      res = Registry.getUsernameForDN(self.request.OwnerDN)
      if not res['OK']:
        self.log.error(
            "Cannot get username for DN", "%s %s" %
            (self.request.OwnerDN, res['Message']))
        return res

      username = res['Value']
      fts3Operation = FTS3TransferOperation.fromRMSObjects(self.request, self.operation, username)
      fts3Operation.ftsFiles = fts3Files

      ftsSchedule = FTS3Client().persistOperation(fts3Operation)
      if not ftsSchedule["OK"]:
        self.log.error("Completely failed to schedule to FTS3:", ftsSchedule["Message"])
        return ftsSchedule

      # might have nothing to schedule
      ftsSchedule = ftsSchedule["Value"]
      self.log.info("Scheduled with FTS3Operation id %s" % ftsSchedule)

      self.log.info("%d files have been scheduled to FTS3" % len(fts3Files))

      for ftsFile in fts3Files:
        opFile = rmsFilesIds[ftsFile.rmsFileID]
        gMonitor.addMark("FTSScheduleOK", 1)
        opFile.Status = "Scheduled"
        self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
    else:
      self.log.info("No files to schedule after metadata checks")

    # Just in case some transfers could not be scheduled, try them with RM
    return self.dmTransfer(fromFTS=True)
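fts3Transfer fans out one FTS3File per missing target: SEs already holding a valid replica are subtracted from the operation's target list before scheduling. The set arithmetic in isolation, with hypothetical SE names:

targetSEList = ['CERN-DST', 'IN2P3-DST']  # hypothetical targets
validReplicas = ['CERN-DST']              # SEs already holding the file
validTargets = list(set(targetSEList) - set(validReplicas))
# validTargets == ['IN2P3-DST']: only the missing copy gets an FTS3File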
Example #55
  def dmTransfer(self, fromFTS=False):
    """ replicate and register using dataManager  """
    # # get waiting files. If none just return
    # # source SE
    sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
    if sourceSE:
      # # check source se for read
      bannedSource = self.checkSEsRSS(sourceSE, 'ReadAccess')
      if not bannedSource["OK"]:
        gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation))
        gMonitor.addMark("ReplicateFail", len(self.operation))
        return bannedSource

      if bannedSource["Value"]:
        self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
        self.log.info(self.operation.Error)
        return S_OK(self.operation.Error)

    # # check targetSEs for write
    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
      gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation))
      gMonitor.addMark("ReplicateFail", len(self.operation))
      return bannedTargets

    if bannedTargets['Value']:
      self.operation.Error = "%s targets are banned for writing" % ",".join(bannedTargets['Value'])
      return S_OK(self.operation.Error)

    # Can continue now
    self.log.verbose("No targets banned for writing")

    waitingFiles = self.getWaitingFilesList()
    if not waitingFiles:
      return S_OK()
    # # loop over files
    if fromFTS:
      self.log.info("Trying transfer using replica manager as FTS failed")
    else:
      self.log.info("Transferring files using Data manager...")
    errors = defaultdict(int)
    delayExecution = 0
    for opFile in waitingFiles:
      if opFile.Error in ("Couldn't get metadata",
                          "File doesn't exist",
                          'No active replica found',
                          "All replicas have a bad checksum",):
        err = "File already in error status"
        errors[err] += 1

      gMonitor.addMark("ReplicateAndRegisterAtt", 1)
      opFile.Error = ''
      lfn = opFile.LFN

      # Check if replica is at the specified source
      replicas = self._filterReplicas(opFile)
      if not replicas["OK"]:
        self.log.error('Failed to check replicas', replicas["Message"])
        continue
      replicas = replicas["Value"]
      validReplicas = replicas.get("Valid")
      noMetaReplicas = replicas.get("NoMetadata")
      noReplicas = replicas.get('NoReplicas')
      badReplicas = replicas.get('Bad')
      noActiveReplicas = replicas.get('NoActiveReplicas')

      if not validReplicas:
        gMonitor.addMark("ReplicateFail")
        if noMetaReplicas:
          err = "Couldn't get metadata"
          errors[err] += 1
          self.log.verbose(
              "unable to replicate '%s', couldn't get metadata at %s" %
              (opFile.LFN, ','.join(noMetaReplicas)))
          opFile.Error = err
        elif noReplicas:
          err = "File doesn't exist"
          errors[err] += 1
          self.log.verbose(
              "Unable to replicate", "File %s doesn't exist at %s" %
              (opFile.LFN, ','.join(noReplicas)))
          opFile.Error = err
          opFile.Status = 'Failed'
        elif badReplicas:
          err = "All replicas have a bad checksum"
          errors[err] += 1
          self.log.error(
              "Unable to replicate", "%s, all replicas have a bad checksum at %s" %
              (opFile.LFN, ','.join(badReplicas)))
          opFile.Error = err
          opFile.Status = 'Failed'
        elif noActiveReplicas:
          err = "No active replica found"
          errors[err] += 1
          self.log.verbose("Unable to schedule transfer",
                           "%s, %s at %s" % (opFile.LFN, err, ','.join(noActiveReplicas)))
          opFile.Error = err
          # All source SEs are banned, delay execution by 1 hour
          delayExecution = 60
        continue
      # # get the first one in the list
      if sourceSE not in validReplicas:
        if sourceSE:
          err = "File not at specified source"
          errors[err] += 1
          self.log.warn(
              "%s is not at specified sourceSE %s, changed to %s" %
              (lfn, sourceSE, validReplicas[0]))
        sourceSE = validReplicas[0]

      # # loop over targetSE
      catalogs = self.operation.Catalog
      if catalogs:
        catalogs = [cat.strip() for cat in catalogs.split(',')]

      for targetSE in self.operation.targetSEList:

        # # call DataManager
        if targetSE in validReplicas:
          self.log.warn("Request to replicate %s to an existing location: %s" % (lfn, targetSE))
          opFile.Status = 'Done'
          continue
        res = self.dm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE, catalog=catalogs)
        if res["OK"]:

          if lfn in res["Value"]["Successful"]:

            if "replicate" in res["Value"]["Successful"][lfn]:

              repTime = res["Value"]["Successful"][lfn]["replicate"]
              prString = "file %s replicated at %s in %s s." % (lfn, targetSE, repTime)

              gMonitor.addMark("ReplicateOK", 1)

              if "register" in res["Value"]["Successful"][lfn]:

                gMonitor.addMark("RegisterOK", 1)
                regTime = res["Value"]["Successful"][lfn]["register"]
                prString += ' and registered in %s s.' % regTime
                self.log.info(prString)
              else:

                gMonitor.addMark("RegisterFail", 1)
                prString += " but failed to register"
                self.log.warn(prString)

                opFile.Error = "Failed to register"
                # # add register replica operation
                registerOperation = self.getRegisterOperation(
                    opFile, targetSE, type='RegisterReplica')
                self.request.insertAfter(registerOperation, self.operation)

            else:

              self.log.error("Failed to replicate", "%s to %s" % (lfn, targetSE))
              gMonitor.addMark("ReplicateFail", 1)
              opFile.Error = "Failed to replicate"

          else:

            gMonitor.addMark("ReplicateFail", 1)
            reason = res["Value"]["Failed"][lfn]
            self.log.error(
                "Failed to replicate and register", "File %s at %s:" %
                (lfn, targetSE), reason)
            opFile.Error = reason

        else:

          gMonitor.addMark("ReplicateFail", 1)
          opFile.Error = "DataManager error: %s" % res["Message"]
          self.log.error("DataManager error", res["Message"])

      if not opFile.Error:
        if len(self.operation.targetSEList) > 1:
          self.log.info("file %s has been replicated to all targetSEs" % lfn)
        opFile.Status = "Done"
    # Log error counts
    if delayExecution:
      self.log.info("Delay execution of the request by %d minutes" % delayExecution)
      self.request.delayNextExecution(delayExecution)
    for error, count in errors.items():
      self.log.error(error, 'for %d files' % count)

    return S_OK()
Example #56
  def __call__( self ):
    """ PutAndRegister operation processing """
    # # list of targetSEs

    targetSEs = self.operation.targetSEList

    if len( targetSEs ) != 1:
      self.log.error( "Wrong value for TargetSE list, should contain only one target!", "%s" % targetSEs )
      self.operation.Error = "Wrong parameters: TargetSE should contain only one targetSE"
      for opFile in self.operation:

        opFile.Status = "Failed"
        opFile.Error = "Wrong parameters: TargetSE should contain only one targetSE"

        gMonitor.addMark( "PutAtt", 1 )
        gMonitor.addMark( "PutFail", 1 )

      return S_ERROR( "TargetSE should contain only one target, got %s" % targetSEs )

    targetSE = targetSEs[0]
    bannedTargets = self.checkSEsRSS( targetSE )
    if not bannedTargets['OK']:
      gMonitor.addMark( "PutAtt" )
      gMonitor.addMark( "PutFail" )
      return bannedTargets

    if bannedTargets['Value']:
      return S_OK( "%s targets are banned for writing" % ",".join( bannedTargets['Value'] ) )

    # # get waiting files
    waitingFiles = self.getWaitingFilesList()

    # # loop over files
    for opFile in waitingFiles:
      # # get LFN
      lfn = opFile.LFN
      self.log.info( "processing file %s" % lfn )
      gMonitor.addMark( "PutAtt", 1 )

      pfn = opFile.PFN
      guid = opFile.GUID
      checksum = opFile.Checksum

      # # call DataManager passing a list of requested catalogs
      catalogs = self.operation.Catalog
      if catalogs:
        catalogs = [ cat.strip() for cat in catalogs.split( ',' ) ]
      putAndRegister = DataManager( catalogs = catalogs ).putAndRegister( lfn,
                                                                          pfn,
                                                                          targetSE,
                                                                          guid = guid,
                                                                          checksum = checksum )
      if not putAndRegister["OK"]:
        gMonitor.addMark( "PutFail", 1 )
#         self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )
        self.log.error( "Completely failed to put and register file", putAndRegister["Message"] )
        opFile.Error = str( putAndRegister["Message"] )
        self.operation.Error = str( putAndRegister["Message"] )
        continue

      putAndRegister = putAndRegister["Value"]

      if lfn in putAndRegister["Failed"]:
        gMonitor.addMark( "PutFail", 1 )
#         self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )

        reason = putAndRegister["Failed"][lfn]
        self.log.error( "Failed to put and register file", " %s at %s: %s" % ( lfn, targetSE, reason ) )
        opFile.Error = str( reason )
        self.operation.Error = str( reason )
        continue

      putAndRegister = putAndRegister["Successful"]
      if lfn in putAndRegister:

        if "put" not in putAndRegister[lfn]:

          gMonitor.addMark( "PutFail", 1 )
#           self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )

          self.log.info( "failed to put %s to %s" % ( lfn, targetSE ) )

          opFile.Error = "put failed"
          self.operation.Error = "put failed"
          continue

        if "register" not in putAndRegister[lfn]:

          gMonitor.addMark( "PutOK", 1 )
          gMonitor.addMark( "RegisterFail", 1 )

#           self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister" )
#           self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", targetSE, "", "PutAndRegister" )

          self.log.info( "put of %s to %s took %s seconds" % ( lfn, targetSE, putAndRegister[lfn]["put"] ) )
          self.log.error( "Register of lfn to SE failed", "%s to %s" % ( lfn, targetSE ) )

          opFile.Error = "failed to register %s at %s" % ( lfn, targetSE )
          opFile.Status = "Failed"

          self.log.info( opFile.Error )
          registerOperation = self.getRegisterOperation( opFile, targetSE )
          self.request.insertAfter( registerOperation, self.operation )
          continue

        gMonitor.addMark( "PutOK", 1 )
        gMonitor.addMark( "RegisterOK", 1 )

#         self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister" )
#         self.dataLoggingClient().addFileRecord( lfn, "Register", targetSE, "", "PutAndRegister" )

        opFile.Status = "Done"
        for op in ( "put", "register" ):
          self.log.info( "%s of %s to %s took %s seconds" % ( op, lfn, targetSE, putAndRegister[lfn][op] ) )

    return S_OK()
Example #57
  def __call__( self ):
    """ call me maybe """
    # # check replicas first
    res = self.__checkReplicas()
    if not res["OK"]:
      self.log.error( 'Failed to check replicas', res["Message"] )

    sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
    if sourceSE:
      # # check source se for read
      bannedSource = self.checkSEsRSS( sourceSE, 'ReadAccess' )
      if not bannedSource["OK"]:
        gMonitor.addMark( "ReplicateAndRegisterAtt", len( self.operation ) )
        gMonitor.addMark( "ReplicateFail", len( self.operation ) )
        return bannedSource

      if bannedSource["Value"]:
        self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
        self.log.info( self.operation.Error )
        return S_OK( self.operation.Error )

    # # check targetSEs for write
    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
      gMonitor.addMark( "ReplicateAndRegisterAtt", len( self.operation ) )
      gMonitor.addMark( "ReplicateFail", len( self.operation ) )
      return bannedTargets

    if bannedTargets['Value']:
      self.operation.Error = "%s targets are banned for writing" % ",".join( bannedTargets['Value'] )
      return S_OK( self.operation.Error )

    # Can continue now
    self.log.verbose( "No targets banned for writing" )

    # # check sourceSEs for removal
    # # for removal the targetSEs are the sourceSEs of the replication
    targetSEs = self.operation.sourceSEList
    bannedTargets = self.checkSEsRSS( targetSEs, access = 'RemoveAccess' )
    if not bannedTargets['OK']:
      gMonitor.addMark( "RemoveReplicaAtt" )
      gMonitor.addMark( "RemoveReplicaFail" )
      return bannedTargets

    if bannedTargets['Value']:
      return S_OK( "%s targets are banned for removal" % ",".join( bannedTargets['Value'] ) )

    # Can continue now
    self.log.verbose( "No targets banned for removal" )

    ## Do the transfer
    # # get waiting files. If none just return
    waitingFiles = self.getWaitingFilesList()
    if not waitingFiles:
      return S_OK()

    # # loop over files, collecting the ones successfully transferred
    self.log.info( "Transferring files using Data manager..." )
    toRemoveDict = {}
    for opFile in waitingFiles:
      res = self.dmTransfer( opFile )
      if res["OK"]:
        toRemoveDict[opFile.LFN] = opFile

    # # remove the source replicas of the transferred files only
    if toRemoveDict:
      self.log.info( "Removing files using Data manager..." )
      self.log.info( "todo: %s replicas to delete from %s sites" % ( len( toRemoveDict ), len( targetSEs ) ) )
      self.dmRemoval( toRemoveDict, targetSEs )

    return S_OK()
Example #58
  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity
    """

    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if resAtt['Value']['Status'] != 'Waiting':
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        return result
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict
Example #59
  def __call__( self ):
    """ call me maybe """
    # # counter for failed replicas

    failedReplicas = 0
    # # catalog to use
    catalog = self.operation.Catalog
    # # get waiting files
    waitingFiles = self.getWaitingFilesList()
    # # loop over files
    registerOperations = {}
    for opFile in waitingFiles:

      gMonitor.addMark( "RegisterReplicaAtt", 1 )

      # # get LFN
      lfn = opFile.LFN
      # # and others
      targetSE = self.operation.targetSEList[0]
      replicaTuple = ( lfn , opFile.PFN, targetSE )
      # # call DataManager
      registerReplica = self.dm.registerReplica( replicaTuple, catalog )
      # # check results
      if not registerReplica["OK"] or lfn in registerReplica["Value"]["Failed"]:
        # There have been some errors
        gMonitor.addMark( "RegisterReplicaFail", 1 )
        self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaFail", catalog, "", "RegisterReplica" )

        reason = registerReplica.get( "Message", registerReplica.get( "Value", {} ).get( "Failed", {} ).get( lfn, 'Unknown' ) )
        errorStr = "failed to register LFN %s: %s" % ( lfn, reason )
        if lfn in registerReplica["Value"].get( "Successful", {} ) and isinstance( reason, dict ):
          # As we managed, let's create a new operation for just the remaining registration
          errorStr += ' - adding registerReplica operations to request'
          for failedCatalog in reason.keys():
            key = '%s/%s' % ( targetSE, failedCatalog )
            newOperation = self.getRegisterOperation( opFile, targetSE, type = 'RegisterReplica', catalog = failedCatalog )
            if key not in registerOperations:
              registerOperations[key] = newOperation
            else:
              registerOperations[key].addFile( newOperation[0] )
          opFile.Status = 'Done'
        else:
          opFile.Error = errorStr
          # If one targets explicitly a catalog and it fails
          if catalog and ( 'file does not exist' in opFile.Error.lower() or 'no such file' in opFile.Error.lower() ):
            opFile.Status = 'Failed'
          failedReplicas += 1
        self.log.warn( errorStr )

      else:
        # All is OK
        gMonitor.addMark( "RegisterReplicaOK", 1 )
        self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaOK", catalog, "", "RegisterReplica" )

        self.log.info( "Replica %s has been registered at %s" % ( lfn, catalog ) )
        opFile.Status = "Done"

    # # if we have new registration operations to add, put them at the end
    if registerOperations:
      self.log.info( "adding %d operations to the request" % len( registerOperations ) )
    for operation in registerOperations.values():
      self.operation._parent.addOperation( operation )
    # # final check
    if failedReplicas:
      self.log.info( "all replicas processed, %s replicas failed to register" % failedReplicas )
      self.operation.Error = "some replicas failed to register"
      return S_ERROR( self.operation.Error )

    return S_OK()