def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10): log = gLogger.getSubLogger("_prepareNewJobs", child=True) filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile) log.debug("%s ftsFiles to submit" % len(filesToSubmit)) newJobs = [] # {targetSE : [FTS3Files] } filesGroupedByTarget = FTS3Utilities.groupFilesByTarget(filesToSubmit) for targetSE, ftsFiles in filesGroupedByTarget.iteritems(): res = self._checkSEAccess(targetSE, 'ReadAccess', vo=self.vo) if not res['OK']: log.error(res) continue for ftsFilesChunk in breakListIntoChunks(ftsFiles, maxFilesPerJob): newJob = self._createNewJob('Staging', ftsFilesChunk, targetSE, sourceSE=targetSE) newJobs.append(newJob) return S_OK(newJobs)
def __init__( self, taskID, timeWait, raiseException=False ): from DIRAC.Core.Base import Script Script.parseCommandLine() from DIRAC.FrameworkSystem.Client.Logger import gLogger self.log = gLogger.getSubLogger( self.__class__.__name__ + "/%s" % taskID ) self.taskID = taskID self.timeWait = timeWait self.raiseException = raiseException
def setUp( self ): from DIRAC.Core.Base import Script Script.parseCommandLine() from DIRAC.FrameworkSystem.Client.Logger import gLogger gLogger.showHeaders( True ) self.log = gLogger.getSubLogger( self.__class__.__name__ ) self.processPool = ProcessPool( 4, 8, 8 ) self.processPool.daemonize()
def __init__(self, timeout=False, bufferLimit=52428800): self.log = gLogger.getSubLogger("Subprocess") self.timeout = False try: self.changeTimeout(timeout) self.bufferLimit = int(bufferLimit) # 50MB limit for data except Exception as x: self.log.exception("Failed initialisation of Subprocess object") raise x
def __init__( self, oTransport = None ): self.oTransport = oTransport self.__oMD5 = hashlib.md5() self.bFinishedTransmission = False self.bReceivedEOF = False self.direction = False self.packetSize = 1048576 self.__fileBytes = 0 self.__log = gLogger.getSubLogger( "FileHelper" )
def _monitorJob(self, ftsJob): """ * query the FTS servers * update the FTSFile status * update the FTSJob status """ # General try/except to avoid that the thread dies try: threadID = current_process().name log = gLogger.getSubLogger("_monitorJob/%s" % ftsJob.jobID, child=True) res = self.getFTS3Context( ftsJob.username, ftsJob.userGroup, ftsJob.ftsServer, threadID=threadID) if not res['OK']: log.error("Error getting context", res) return ftsJob, res context = res['Value'] res = ftsJob.monitor(context=context) if not res['OK']: log.error("Error monitoring job", res) return ftsJob, res # { fileID : { Status, Error } } filesStatus = res['Value'] # Specify the job ftsGUID to make sure we do not overwrite # status of files already taken by newer jobs res = self.fts3db.updateFileStatus(filesStatus, ftsGUID=ftsJob.ftsGUID) if not res['OK']: log.error("Error updating file fts status", "%s, %s" % (ftsJob.ftsGUID, res)) return ftsJob, res upDict = { ftsJob.jobID: { 'status': ftsJob.status, 'error': ftsJob.error, 'completeness': ftsJob.completeness, 'operationID': ftsJob.operationID, 'lastMonitor': True, } } res = self.fts3db.updateJobStatus(upDict) if ftsJob.status in ftsJob.FINAL_STATES: self.__sendAccounting(ftsJob) return ftsJob, res except Exception as e: return ftsJob, S_ERROR(0, "Exception %s" % repr(e))
def getFTS3Context(self, username, group, ftsServer, threadID): """ Returns an fts3 context for a given user, group and fts server The context pool is per thread, and there is one context per tuple (user, group, server). We dump the proxy of a user to a file (shared by all the threads), and use it to make the context. The proxy needs a lifetime of at least 2h, is cached for 1.5h, and the lifetime of the context is 45mn :param username: name of the user :param group: group of the user :param ftsServer: address of the server :returns: S_OK with the context object """ log = gLogger.getSubLogger("getFTS3Context", child=True) contextes = self._globalContextCache.setdefault(threadID, DictCache()) idTuple = (username, group, ftsServer) log.debug("Getting context for %s" % (idTuple, )) if not contextes.exists(idTuple, 2700): res = getDNForUsername(username) if not res['OK']: return res # We take the first DN returned userDN = res['Value'][0] log.debug("UserDN %s" % userDN) # We dump the proxy to a file. # It has to have a lifetime of at least 2 hours # and we cache it for 1.5 hours res = gProxyManager.downloadVOMSProxyToFile( userDN, group, requiredTimeLeft=7200, cacheTime=5400) if not res['OK']: return res proxyFile = res['Value'] log.debug("Proxy file %s" % proxyFile) # We generate the context res = FTS3Job.generateContext(ftsServer, proxyFile) if not res['OK']: return res context = res['Value'] # we add it to the cache for this thread for 1h contextes.add(idTuple, 3600, context) return S_OK(contextes.get(idTuple))
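The context pool above keeps one context per (user, group, server) tuple and per thread, and only reuses an entry while it still has enough lifetime left (45 minutes here). Below is a minimal stand-alone sketch of the same time-to-live caching pattern, using a plain dict instead of DIRAC's DictCache; all names in it are illustrative, not part of the DIRAC API.

import time

class TTLCache(object):
    """Illustrative TTL cache: an entry is only reused while it is still valid."""

    def __init__(self):
        self._store = {}  # key -> (expiry timestamp, value)

    def exists(self, key, requiredRemaining=0):
        """True if key is present and valid for at least requiredRemaining more seconds."""
        entry = self._store.get(key)
        return bool(entry) and entry[0] - time.time() >= requiredRemaining

    def add(self, key, lifetime, value):
        self._store[key] = (time.time() + lifetime, value)

    def get(self, key):
        entry = self._store.get(key)
        return entry[1] if entry else None

# Usage mirroring getFTS3Context: reuse the cached context only if it is
# still valid for 45 more minutes, otherwise build and cache a fresh one.
cache = TTLCache()
idTuple = ('someuser', 'somegroup', 'https://fts3.example.org:8446')
if not cache.exists(idTuple, 2700):
    cache.add(idTuple, 3600, object())  # a freshly generated fts3 context would go here
context = cache.get(idTuple)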
def _monitorJobCallback(returnedValue): """ Callback when a job has been monitored :param returnedValue: value returned by the _monitorJob method (ftsJob, standard dirac return struct) """ ftsJob, res = returnedValue log = gLogger.getSubLogger("_monitorJobCallback/%s" % ftsJob.jobID, child=True) if not res['OK']: log.error("Error updating job status", res) else: log.debug("Successfully updated job status")
def __init__( self, taskID, timeWait, raiseException=False ): from DIRAC.Core.Base import Script Script.parseCommandLine() from DIRAC.FrameworkSystem.Client.Logger import gLogger self.log = gLogger.getSubLogger( self.__class__.__name__ + "/%s" % taskID ) self.taskID = taskID self.log.always( "pid=%s task=%s I'm locked" % ( os.getpid(), self.taskID ) ) gLock.acquire() self.log.always("you can't see that line, object is stuck by gLock" ) self.timeWait = timeWait self.raiseException = raiseException gLock.release()
def _treatOperationCallback(returnedValue): """ Callback when an operation has been treated :param returnedValue: value returned by the _treatOperation method (ftsOperation, standard dirac return struct) """ operation, res = returnedValue log = gLogger.getSubLogger("_treatOperationCallback/%s" % operation.operationID, child=True) if not res['OK']: log.error("Error treating operation", res) else: log.debug("Successfully treated operation")
def kickJobs(self): """ kick stuck jobs """ log = gLogger.getSubLogger("kickJobs", child=True) res = self.fts3db.kickStuckJobs(limit=self.maxKick, kickDelay=self.kickDelay) if not res['OK']: return res kickedJobs = res['Value'] log.info("Kicked %s stuck jobs" % kickedJobs) return S_OK()
def deleteOperations(self): """ delete final operations """ log = gLogger.getSubLogger("deleteOperations", child=True) res = self.fts3db.deleteFinalOperations(limit=self.maxDelete, deleteDelay=self.deleteDelay) if not res['OK']: return res deletedOperations = res['Value'] log.info("Deleted %s final operations" % deletedOperations) return S_OK()
def __init__( self, timeout = False, bufferLimit = 52428800 ): """ c'tor :param int timeout: timeout in seconds :param int bufferLimit: buffer size, default 50MB """ self.log = gLogger.getSubLogger( 'Subprocess' ) self.timeout = False try: self.changeTimeout( timeout ) self.bufferLimit = int( bufferLimit ) # 50MB limit for data except Exception as x: self.log.exception( 'Failed initialisation of Subprocess object' ) raise x
def setUp( self ): """c'tor :param self: self reference """ from DIRAC.Core.Base import Script Script.parseCommandLine() from DIRAC.FrameworkSystem.Client.Logger import gLogger gLogger.showHeaders( True ) self.log = gLogger.getSubLogger( self.__class__.__name__ ) self.processPool = ProcessPool( 4, 8, 8, poolCallback = self.poolCallback, poolExceptionCallback = self.poolExceptionCallback ) self.processPool.daemonize()
def init_on_load(self): """ This method initializes some attributes. It is called by sqlalchemy (which does not call __init__) """ self._vo = None self.dManager = DataManager() self.rssClient = ResourceStatus() opID = getattr(self, 'operationID', None) loggerName = '%s/' % opID if opID else '' loggerName += 'req_%s/op_%s' % (self.rmsReqID, self.rmsOpID) self._log = gLogger.getSubLogger(loggerName, True)
def _constructRemovalJob(self, context, allTargetSURLs, failedLFNs, target_spacetoken): """ Build a job for removal Some attributes of the job are expected to be set * targetSE * activity (optional) * priority (optional) * filesToSubmit * operationID (optional, used as metadata for the job) :param context: fts3 context :param allTargetSURLs: dict {lfn:surl} for the target :param failedLFNs: set of LFNs in filesToSubmit for which there was a problem :param target_spacetoken: the space token of the target :return: S_OK( (job object, list of ftsFileIDs in the job)) """ log = gLogger.getSubLogger( "constructRemovalJob/%s/%s" % (self.operationID, self.targetSE), True) transfers = [] fileIDsInTheJob = [] for ftsFile in self.filesToSubmit: if ftsFile.lfn in failedLFNs: log.debug("Not preparing transfer for file %s" % ftsFile.lfn) continue transfers.append({'surl': allTargetSURLs[ftsFile.lfn], 'metadata': getattr(ftsFile, 'fileID')}) fileIDsInTheJob.append(getattr(ftsFile, 'fileID')) # We add a few metadata to the fts job so that we can reuse them later on without # querying our DB. # source and target SE are just used for accounting purpose job_metadata = { 'operationID': self.operationID, 'sourceSE': self.sourceSE, 'targetSE': self.targetSE} job = fts3.new_delete_job(transfers, spacetoken=target_spacetoken, metadata=job_metadata) job['params']['retry'] = 3 job['params']['priority'] = self.priority return S_OK((job, fileIDsInTheJob))
def selectUniqueRandomSource(ftsFiles, allowedSources=None): """ For a list of FTS3files object, select a random source, and group the files by source. :param allowedSources: list of allowed sources :param ftsFiles: list of FTS3File object :return: S_OK({ sourceSE: [ FTS3Files] }) """ _log = gLogger.getSubLogger("selectUniqueRandomSource") allowedSourcesSet = set(allowedSources) if allowedSources else set() # destGroup will contain for each target SE a dict { source : [list of FTS3Files] } groupBySource = {} # For all files, check which possible sources they have res = _checkSourceReplicas(ftsFiles) if not res['OK']: return res filteredReplicas = res['Value'] for ftsFile in ftsFiles: if ftsFile.lfn in filteredReplicas['Failed']: _log.error("Failed to get active replicas", "%s,%s" % (ftsFile.lfn, filteredReplicas['Failed'][ftsFile.lfn])) continue replicaDict = filteredReplicas['Successful'][ftsFile.lfn] # Only consider the allowed sources # If we have a restriction, apply it, otherwise take all the replicas allowedReplicaSource = (set(replicaDict) & allowedSourcesSet) if allowedSourcesSet else replicaDict # pick a random source randSource = random.choice(list(allowedReplicaSource)) # one has to convert to list groupBySource.setdefault(randSource, []).append(ftsFile) return S_OK(groupBySource)
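A small illustration of the grouping this helper returns, with a stub standing in for FTS3File and a hand-written replica map replacing the catalogue lookup; both are assumptions made only for the example.

import random

class StubFTS3File(object):
    """Stub with the single attribute the grouping logic relies on."""
    def __init__(self, lfn):
        self.lfn = lfn

# pretend replica information: lfn -> {storage element: replica}
replicas = {
    '/vo/data/file1': {'SE-A': 'srm://a/file1', 'SE-B': 'srm://b/file1'},
    '/vo/data/file2': {'SE-B': 'srm://b/file2'},
}

groupBySource = {}
for ftsFile in [StubFTS3File('/vo/data/file1'), StubFTS3File('/vo/data/file2')]:
    randSource = random.choice(list(replicas[ftsFile.lfn]))
    groupBySource.setdefault(randSource, []).append(ftsFile)

# one possible outcome: {'SE-B': ['/vo/data/file1', '/vo/data/file2']}
print({se: [f.lfn for f in files] for se, files in groupBySource.items()})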
def kickOperations(self): """ Kick stuck operations :return: S_OK()/S_ERROR() """ log = gLogger.getSubLogger("kickOperations", child=True) res = self.fts3db.kickStuckOperations(limit=self.maxKick, kickDelay=self.kickDelay) if not res['OK']: return res kickedOperations = res['Value'] log.info("Kicked %s stuck operations" % kickedOperations) return S_OK()
def finalize(self, timeout=60): """ Drain pool, shut down processing in a more or less clean way :param self: self reference :param timeout: seconds to wait before killing """ # # start draining self.__draining = True # # join daemon process if self.__daemonProcess: self.__daemonProcess.join(timeout) # # process all tasks self.processAllResults(timeout) # # set stop event, all idle workers should be terminated self.__stopEvent.set() # # join idle workers start = time.time() log = gLogger.getSubLogger("ProcessPool/finalize") nWorkers = 9999999 while self.__workersDict: self.__cleanDeadProcesses() if len(self.__workersDict) != nWorkers: nWorkers = len(self.__workersDict) log.debug("%d workers still active, timeout = %d" % (nWorkers, timeout)) if timeout <= 0 or time.time() - start >= timeout: break time.sleep(0.1) # # second clean up - join and terminate workers if self.__workersDict: log.debug( "After cleaning dead processes, %d workers still active, timeout = %d" % (len(self.__workersDict), timeout)) for worker in self.__workersDict.values(): if worker.is_alive(): worker.terminate() worker.join(5) self.__cleanDeadProcesses() # third clean up - kill'em all!!! if self.__workersDict: log.debug( "After terminating processes, %d workers still active, timeout = %d, kill them" % (len(self.__workersDict), timeout)) self.__filicide()
def monitorJobsLoop(self): """ * fetch the active FTSJobs from the DB * spawn a thread to monitor each of them :return: S_OK()/S_ERROR() """ log = gLogger.getSubLogger("monitorJobs", child=True) log.debug("Size of the context cache %s" % len(self._globalContextCache)) log.debug("Getting active jobs") # get jobs from DB res = self.fts3db.getActiveJobs(limit=self.jobBulkSize, jobAssignmentTag=self.assignmentTag) if not res['OK']: log.error("Could not retrieve ftsJobs from the DB", res) return res activeJobs = res['Value'] log.info("%s jobs to queue for monitoring" % len(activeJobs)) # We store here the AsyncResult object on which we are going to wait applyAsyncResults = [] # Starting the monitoring threads for ftsJob in activeJobs: log.debug("Queuing executing of ftsJob %s" % ftsJob.jobID) # queue the execution of self._monitorJob( ftsJob ) in the thread pool # The returned value is passed to _monitorJobCallback applyAsyncResults.append( self.jobsThreadPool.apply_async( self._monitorJob, (ftsJob, ), callback=self._monitorJobCallback)) log.debug("All execution queued") # Waiting for all the monitoring to finish while not all([r.ready() for r in applyAsyncResults]): log.debug("Not all the tasks are finished") time.sleep(0.5) log.debug("All the tasks have completed") return S_OK()
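The waiting strategy above is plain multiprocessing.pool usage: tasks are queued with apply_async, a callback consumes each result, and the loop polls the AsyncResult objects until all are ready. A self-contained sketch of that pattern using only the standard library; the worker and callback bodies are placeholders.

import time
from multiprocessing.pool import ThreadPool

def monitorOne(jobID):
    """Placeholder standing in for _monitorJob: pretend to query a remote service."""
    time.sleep(0.1)
    return jobID, {'OK': True}

def onMonitored(returnedValue):
    """Placeholder standing in for _monitorJobCallback."""
    jobID, res = returnedValue
    print("job %s monitored, OK=%s" % (jobID, res['OK']))

pool = ThreadPool(4)
applyAsyncResults = [pool.apply_async(monitorOne, (jobID,), callback=onMonitored)
                     for jobID in range(8)]

# same polling loop as monitorJobsLoop: wait until every AsyncResult is ready
while not all(r.ready() for r in applyAsyncResults):
    time.sleep(0.5)

pool.close()
pool.join()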
def execute(self): """ One cycle execution :return: S_OK()/S_ERROR() """ log = gLogger.getSubLogger("execute", child=True) log.info("Monitoring job") res = self.monitorJobsLoop() if not res['OK']: log.error("Error monitoring jobs", res) return res log.info("Treating operations") res = self.treatOperationsLoop() if not res['OK']: log.error("Error treating operations", res) return res log.info("Kicking stuck jobs") res = self.kickJobs() if not res['OK']: log.error("Error kicking jobs", res) return res log.info("Kicking stuck operations") res = self.kickOperations() if not res['OK']: log.error("Error kicking operations", res) return res log.info("Deleting final operations") res = self.deleteOperations() if not res['OK']: log.error("Error deleting operations", res) return res return S_OK()
def __init__(self, serverDict, serverPolicy="Random"): """ Call the init of the parent, and initialize the list of FTS3 servers """ self.log = gLogger.getSubLogger("FTS3ServerPolicy") self._serverDict = serverDict self._serverList = list(serverDict) # explicit list so it can be indexed and sampled under Python 3 as well self._maxAttempts = len(self._serverList) self._nextServerID = 0 self._resourceStatus = ResourceStatus() methName = "_%sServerPolicy" % serverPolicy.lower() if not hasattr(self, methName): self.log.error('Unknown server policy %s. Using Random instead' % serverPolicy) methName = "_randomServerPolicy" self._policyMethod = getattr(self, methName)
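The constructor resolves the policy implementation by building a method name and falling back to the random policy when the name is unknown. A minimal, generic sketch of that getattr dispatch pattern; the class, methods and server names below are purely illustrative.

import random

class PolicyDispatcher(object):
    """Illustrative dispatch-by-name with a safe default."""

    def __init__(self, serverPolicy="Random"):
        methName = "_%sServerPolicy" % serverPolicy.lower()
        if not hasattr(self, methName):
            print("Unknown server policy %s. Using Random instead" % serverPolicy)
            methName = "_randomServerPolicy"
        self._policyMethod = getattr(self, methName)

    def _randomServerPolicy(self, servers):
        return random.choice(servers)

    def _sequenceServerPolicy(self, servers):
        return servers[0]

    def choose(self, servers):
        return self._policyMethod(servers)

# 'Sequence' resolves to _sequenceServerPolicy; an unrecognised name falls back to random
print(PolicyDispatcher("Sequence").choose(["fts3-a.example.org", "fts3-b.example.org"]))
print(PolicyDispatcher("DoesNotExist").choose(["fts3-a.example.org", "fts3-b.example.org"]))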
def finalize(self): """ finalize processing """ # Joining all the ThreadPools log = gLogger.getSubLogger("Finalize") log.debug("Closing jobsThreadPool") self.jobsThreadPool.close() self.jobsThreadPool.join() log.debug("jobsThreadPool joined") log.debug("Closing opsThreadPool") self.opsThreadPool.close() self.opsThreadPool.join() log.debug("opsThreadPool joined") return S_OK()
def generatePossibleTransfersBySources(ftsFiles, allowedSources=None): """ For a list of FTS3File objects, group the files by possible transfer source CAUTION: a given LFN can have several possible sources, so you still have to choose one! :param allowedSources: list of allowed sources :param ftsFiles: list of FTS3File objects :return: S_OK({ sourceSE: [ FTS3Files] }) """ _log = gLogger.getSubLogger("generatePossibleTransfersBySources", True) # destGroup will contain for each target SE a dict { possible source : transfer metadata } groupBySource = {} # For all files, check which possible sources they have res = _checkSourceReplicas(ftsFiles) if not res['OK']: return res filteredReplicas = res['Value'] for ftsFile in ftsFiles: if ftsFile.lfn in filteredReplicas['Failed']: _log.error("Failed to get active replicas", "%s,%s" % (ftsFile.lfn, filteredReplicas['Failed'][ftsFile.lfn])) continue replicaDict = filteredReplicas['Successful'][ftsFile.lfn] for se in replicaDict: # if a source restriction is imposed, respect it if allowedSources and se not in allowedSources: continue groupBySource.setdefault(se, []).append(ftsFile) return S_OK(groupBySource)
def execute(self): """ one cycle execution """ log = gLogger.getSubLogger("execute", child=True) log.info("Monitoring job") res = self.monitorJobsLoop() if not res['OK']: log.error("Error monitoring jobs", res) return res log.info("Treating operations") res = self.treatOperationsLoop() if not res['OK']: log.error("Error treating operations", res) return res log.info("Kicking stuck jobs") res = self.kickJobs() if not res['OK']: log.error("Error kicking jobs", res) return res log.info("Kicking stuck operations") res = self.kickOperations() if not res['OK']: log.error("Error kicking operations", res) return res log.info("Deleting final operations") res = self.deleteOperations() if not res['OK']: log.error("Error deleting operations", res) return res return S_OK()
def treatOperationsLoop(self): """ * Fetch all the FTSOperations which are not finished * Spawn a thread to treat each operation """ log = gLogger.getSubLogger("treatOperations", child=True) log.debug("Size of the context cache %s" % len(self._globalContextCache)) log.info("Getting non finished operations") res = self.fts3db.getNonFinishedOperations( limit=self.operationBulkSize, operationAssignmentTag=self.assignmentTag) if not res['OK']: log.error("Could not get incomplete operations", res) return res incompleteOperations = res['Value'] log.info("Treating %s incomplete operations" % len(incompleteOperations)) applyAsyncResults = [] for operation in incompleteOperations: log.debug("Queuing executing of operation %s" % operation.operationID) # queue the execution of self._treatOperation( operation ) in the thread pool # The returned value is passed to _treatOperationCallback applyAsyncResults.append(self.opsThreadPool.apply_async( self._treatOperation, (operation, ), callback=self._treatOperationCallback)) log.debug("All execution queued") # Waiting for all the treatments to finish while not all([r.ready() for r in applyAsyncResults]): log.debug("Not all the tasks are finished") time.sleep(0.5) log.debug("All the tasks have completed") return S_OK()
def __init__(self, timeout=False, bufferLimit=52428800): """c'tor :param int timeout: timeout in seconds :param int bufferLimit: buffer size, default 50MB """ self.log = gLogger.getSubLogger("Subprocess") self.timeout = False try: self.changeTimeout(timeout) self.bufferLimit = int(bufferLimit) # 50MB limit for data except Exception as x: self.log.exception("Failed initialisation of Subprocess object") raise x self.child = None self.childPID = 0 self.childKilled = False self.callback = None self.bufferList = [] self.cmdSeq = []
def monitorJobsLoop(self): """ * fetch the active FTSJobs from the DB * spawn a thread to monitor each of them """ log = gLogger.getSubLogger("monitorJobs", child=True) log.debug("Size of the context cache %s" % len(self._globalContextCache)) log.debug("Getting active jobs") # get jobs from DB res = self.fts3db.getActiveJobs(limit=self.jobBulkSize, jobAssignmentTag=self.assignmentTag) if not res['OK']: log.error("Could not retrieve ftsJobs from the DB", res) return res activeJobs = res['Value'] log.info("%s jobs to queue for monitoring" % len(activeJobs)) # We store here the AsyncResult object on which we are going to wait applyAsyncResults = [] # Starting the monitoring threads for ftsJob in activeJobs: log.debug("Queuing executing of ftsJob %s" % ftsJob.jobID) # queue the execution of self._monitorJob( ftsJob ) in the thread pool # The returned value is passed to _monitorJobCallback applyAsyncResults.append(self.jobsThreadPool.apply_async( self._monitorJob, (ftsJob, ), callback=self._monitorJobCallback)) log.debug("All execution queued") # Waiting for all the monitoring to finish while not all([r.ready() for r in applyAsyncResults]): log.debug("Not all the tasks are finished") time.sleep(0.5) log.debug("All the tasks have completed") return S_OK()
def treatOperationsLoop(self): """ * Fetch all the FTSOperations which are not finished * Spawn a thread to treat each operation """ log = gLogger.getSubLogger("treatOperations", child=True) thPool = ThreadPool(self.maxNumberOfThreads) log.info("Getting non finished operations") res = self.fts3db.getNonFinishedOperations( limit=self.operationBulkSize, operationAssignmentTag=self.assignmentTag) if not res['OK']: log.error("Could not get incomplete operations", res) return res incompleteOperations = res['Value'] log.info("Treating %s incomplete operations" % len(incompleteOperations)) for operation in incompleteOperations: log.debug("Queuing executing of operation %s" % operation.operationID) # queue the execution of self._treatOperation( operation ) in the thread pool # The returned value is passed to _treatOperationCallback thPool.apply_async(self._treatOperation, (operation, ), callback=self._treatOperationCallback) log.debug("All execution queued") # Waiting for all the treatments to finish thPool.close() thPool.join() log.debug("thPool joined") return S_OK()
def selectUniqueRandomSource(ftsFiles, allowedSources=None): """ For a list of FTS3files object, select a random source, and group the files by source. :param allowedSources : list of allowed sources :param ftsFiles : list of FTS3File object :return: S_OK({ sourceSE: [ FTS3Files] }) """ _log = gLogger.getSubLogger("selectUniqueRandomSource") # destGroup will contain for each target SE a dict { source : [list of FTS3Files] } groupBySource = {} # For all files, check which possible sources they have res = _checkSourceReplicas(ftsFiles) if not res['OK']: return res filteredReplicas = res['Value'] for ftsFile in ftsFiles: if ftsFile.lfn in filteredReplicas['Failed']: _log.error("Failed to get active replicas", "%s,%s" % (ftsFile.lfn, filteredReplicas['Failed'][ftsFile.lfn])) continue replicaDict = filteredReplicas['Successful'][ftsFile.lfn] # pick a random source randSource = random.choice(list(replicaDict)) # one has to convert to list groupBySource.setdefault(randSource, []).append(ftsFile) return S_OK(groupBySource)
def init_on_load(self): """This method initializes some attributes. It is called by sqlalchemy (which does not call __init__) """ self._vo = None # Note that in the case of an FTS3Operation created from an RMS # object, the members here will probably be "wrong" in the sense # that the VO will not be known by then. # It does not really matter however, since we do not perform anything # on an operation created this way, it's just to be then serialized # in the DB. self.dManager = DataManager() self.rssClient = ResourceStatus() self.fts3Plugin = FTS3Utilities.getFTS3Plugin(vo=self.vo) opID = getattr(self, "operationID", None) loggerName = "%s/" % opID if opID else "" loggerName += "req_%s/op_%s" % (self.rmsReqID, self.rmsOpID) self._log = gLogger.getSubLogger(loggerName)
def monitorJobsLoop(self): """ * fetch the active FTSJobs from the DB * spawn a thread to monitor each of them """ log = gLogger.getSubLogger("monitorJobs", child=True) thPool = ThreadPool(self.maxNumberOfThreads) log.debug("Getting active jobs") # get jobs from DB res = self.fts3db.getActiveJobs(limit=self.jobBulkSize, jobAssignmentTag=self.assignmentTag) if not res['OK']: log.error("Could not retrieve ftsJobs from the DB", res) return res activeJobs = res['Value'] log.info("%s jobs to queue for monitoring" % len(activeJobs)) # Starting the monitoring threads for ftsJob in activeJobs: log.debug("Queuing executing of ftsJob %s" % ftsJob.jobID) # queue the execution of self._monitorJob( ftsJob ) in the thread pool # The returned value is passed to _monitorJobCallback thPool.apply_async(self._monitorJob, (ftsJob, ), callback=self._monitorJobCallback) log.debug("All execution queued") # Waiting for all the monitoring to finish thPool.close() thPool.join() log.debug("thPool joined") return S_OK()
def _treatOperation(self, operation): """ Treat one operation: * does the callback if the operation is finished * generate new jobs and submits them :param operation: the operation to treat :return: operation, S_OK()/S_ERROR() """ try: threadID = current_process().name log = gLogger.getSubLogger("treatOperation/%s" % operation.operationID, child=True) # If the operation is totally processed # we perform the callback if operation.isTotallyProcessed(): log.debug("FTS3Operation %s is totally processed" % operation.operationID) res = operation.callback() if not res['OK']: log.error("Error performing the callback", res) log.info("Putting back the operation") dbRes = self.fts3db.persistOperation(operation) if not dbRes['OK']: log.error("Could not persist operation", dbRes) return operation, res else: log.debug("FTS3Operation %s is not totally processed yet" % operation.operationID) # This flag is set to False if we want to stop the ongoing processing # of an operation, typically when the matching RMS Request has been # canceled (see below) continueOperationProcessing = True # Check the status of the associated RMS Request. # If it is canceled then we will not create new FTS3Jobs, and mark # this as FTS3Operation canceled. if operation.rmsReqID: res = ReqClient().getRequestStatus(operation.rmsReqID) if not res['OK']: log.error("Could not get request status", res) return operation, res rmsReqStatus = res['Value'] if rmsReqStatus == 'Canceled': log.info( "The RMS Request is canceled, canceling the FTS3Operation", "rmsReqID: %s, FTS3OperationID: %s" % (operation.rmsReqID, operation.operationID)) operation.status = 'Canceled' continueOperationProcessing = False if continueOperationProcessing: res = operation.prepareNewJobs( maxFilesPerJob=self.maxFilesPerJob, maxAttemptsPerFile=self.maxAttemptsPerFile) if not res['OK']: log.error( "Cannot prepare new Jobs", "FTS3Operation %s : %s" % (operation.operationID, res)) return operation, res newJobs = res['Value'] log.debug("FTS3Operation %s: %s new jobs to be submitted" % (operation.operationID, len(newJobs))) for ftsJob in newJobs: res = self._serverPolicy.chooseFTS3Server() if not res['OK']: log.error(res) continue ftsServer = res['Value'] log.debug("Use %s server" % ftsServer) ftsJob.ftsServer = ftsServer res = self.getFTS3Context(ftsJob.username, ftsJob.userGroup, ftsServer, threadID=threadID) if not res['OK']: log.error("Could not get context", res) continue context = res['Value'] res = ftsJob.submit(context=context, protocols=self.thirdPartyProtocols) if not res['OK']: log.error( "Could not submit FTS3Job", "FTS3Operation %s : %s" % (operation.operationID, res)) continue operation.ftsJobs.append(ftsJob) submittedFileIds = res['Value'] log.info( "FTS3Operation %s: Submitted job for %s transfers" % (operation.operationID, len(submittedFileIds))) # new jobs are put in the DB at the same time res = self.fts3db.persistOperation(operation) if not res['OK']: log.error("Could not persist operation", res) return operation, res except Exception as e: log.exception('Exception in the thread', repr(e)) return operation, S_ERROR("Exception %s" % repr(e))
def monitorJobsLoop(self): """* fetch the active FTSJobs from the DB * spawn a thread to monitor each of them :return: S_OK()/S_ERROR() """ log = gLogger.getSubLogger("monitorJobs") log.debug("Size of the context cache %s" % len(self._globalContextCache)) # Find the number of loops nbOfLoops, mod = divmod(self.jobBulkSize, JOB_MONITORING_BATCH_SIZE) if mod: nbOfLoops += 1 log.debug("Getting active jobs") for loopId in range(nbOfLoops): log.info("Getting next batch of jobs to monitor", "%s/%s" % (loopId, nbOfLoops)) # get jobs from DB res = self.fts3db.getActiveJobs( limit=JOB_MONITORING_BATCH_SIZE, jobAssignmentTag=self.assignmentTag) if not res["OK"]: log.error("Could not retrieve ftsJobs from the DB", res) return res activeJobs = res["Value"] log.info("Jobs queued for monitoring", len(activeJobs)) # We store here the AsyncResult object on which we are going to wait applyAsyncResults = [] # Starting the monitoring threads for ftsJob in activeJobs: log.debug("Queuing executing of ftsJob %s" % ftsJob.jobID) # queue the execution of self._monitorJob( ftsJob ) in the thread pool # The returned value is passed to _monitorJobCallback applyAsyncResults.append( self.jobsThreadPool.apply_async( self._monitorJob, (ftsJob, ), callback=self._monitorJobCallback)) log.debug("All execution queued") # Waiting for all the monitoring to finish while not all([r.ready() for r in applyAsyncResults]): log.debug("Not all the tasks are finished") time.sleep(0.5) # If we got less to monitor than what we asked, # stop looping if len(activeJobs) < JOB_MONITORING_BATCH_SIZE: break log.debug("All the tasks have completed") return S_OK()
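This batched variant derives the number of monitoring passes from jobBulkSize with divmod, rounding up when there is a remainder, and stops early once the DB hands back fewer jobs than a full batch. A short sketch of that arithmetic; the batch size of 20 is only an example value, not the agent's actual constant.

JOB_MONITORING_BATCH_SIZE = 20  # example value; the real constant is defined in the agent module
jobBulkSize = 70

nbOfLoops, mod = divmod(jobBulkSize, JOB_MONITORING_BATCH_SIZE)
if mod:
    nbOfLoops += 1  # one extra, partial batch for the remainder

print(nbOfLoops)  # 4 -> batches of 20, 20, 20 and 10 jobs

# The loop additionally breaks as soon as a batch comes back smaller than
# JOB_MONITORING_BATCH_SIZE, i.e. when the DB has no more active jobs to hand out.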
def submit(self, context=None, ftsServer=None, ucert=None, pinTime=36000, protocols=None): """ submit the job to the FTS server Some attributes are expected to be defined for the submission to work: * type (set by FTS3Operation) * sourceSE (only for Transfer jobs) * targetSE * activity (optional) * priority (optional) * username * userGroup * filesToSubmit * operationID (optional, used as metadata for the job) We also expect the FTSFiles to have an ID defined, as it is given as transfer metadata :param pinTime: Time the file should be pinned on disk (used for transfers and staging) Used only if the source SE is a tape storage :param context: fts3 context. If not given, it is created (see ftsServer & ucert param) :param ftsServer: the address of the fts server to submit to. Used only if context is not given. If not given either, use the ftsServer object attribute :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc) :param protocols: list of protocols from which we should choose the protocol to use :returns: S_OK([FTSFiles ids of files submitted]) """ log = gLogger.getSubLogger("submit/%s/%s_%s" % (self.operationID, self.sourceSE, self.targetSE), True) if not context: if not ftsServer: ftsServer = self.ftsServer context = fts3.Context( endpoint=ftsServer, ucert=ucert, request_class=ftsSSLRequest, verify=False) # Construct the target SURL res = self.__fetchSpaceToken(self.targetSE) if not res['OK']: return res target_spacetoken = res['Value'] allLFNs = [ftsFile.lfn for ftsFile in self.filesToSubmit] if self.type == 'Transfer': res = self._constructTransferJob( pinTime, allLFNs, target_spacetoken, protocols=protocols) elif self.type == 'Staging': res = self._constructStagingJob( pinTime, allLFNs, target_spacetoken) # elif self.type == 'Removal': # res = self._constructRemovalJob(context, allLFNs, failedLFNs, target_spacetoken) if not res['OK']: return res job, fileIDsInTheJob = res['Value'] setFileIdsInTheJob = set(fileIDsInTheJob) try: self.ftsGUID = fts3.submit(context, job) log.info("Got GUID %s" % self.ftsGUID) # Only increase the number of attempts # if we succeeded in submitting -> no ! Why did I do that ?? for ftsFile in self.filesToSubmit: ftsFile.attempt += 1 # This should never happen because a file should be "released" # first by the previous job. # But we just print a warning if ftsFile.ftsGUID is not None: log.warn( "FTSFile has a non NULL ftsGUID at job submission time", "FileID: %s existing ftsGUID: %s" % (ftsFile.fileID, ftsFile.ftsGUID)) # `assign` the file to this job ftsFile.ftsGUID = self.ftsGUID if ftsFile.fileID in setFileIdsInTheJob: ftsFile.status = 'Submitted' now = datetime.datetime.utcnow().replace(microsecond=0) self.submitTime = now self.lastUpdate = now self.lastMonitor = now except FTS3ClientException as e: log.exception("Error at submission", repr(e)) return S_ERROR("Error at submission: %s" % e) return S_OK(fileIDsInTheJob)
def _constructStagingJob(self, pinTime, allLFNs, target_spacetoken): """ Build a job for staging Some attributes of the job are expected to be set * targetSE * activity (optional) * priority (optional) * filesToSubmit * operationID (optional, used as metadata for the job) :param pinTime: pinning time in case staging is needed :param allLFNs: List of LFNs to stage :param target_spacetoken: the space token of the target :return: S_OK( (job object, list of ftsFileIDs in the job)) """ log = gLogger.getSubLogger( "constructStagingJob/%s/%s" % (self.operationID, self.targetSE), True) transfers = [] fileIDsInTheJob = [] # Set of LFNs for which we did not get an SRM URL failedLFNs = set() # getting all the target surls res = StorageElement(self.targetSE, vo=self.vo).getURL(allLFNs, protocol='srm') if not res['OK']: return res for lfn, reason in res['Value']['Failed'].iteritems(): failedLFNs.add(lfn) log.error("Could not get target SURL", "%s %s" % (lfn, reason)) allTargetSURLs = res['Value']['Successful'] for ftsFile in self.filesToSubmit: if ftsFile.lfn in failedLFNs: log.debug("Not preparing transfer for file %s" % ftsFile.lfn) continue sourceSURL = targetSURL = allTargetSURLs[ftsFile.lfn] trans = fts3.new_transfer(sourceSURL, targetSURL, checksum='ADLER32:%s' % ftsFile.checksum, filesize=ftsFile.size, metadata=getattr(ftsFile, 'fileID'), activity=self.activity) transfers.append(trans) fileIDsInTheJob.append(getattr(ftsFile, 'fileID')) # If the source is not a tape SE, we should set the # copy_pin_lifetime and bring_online params to None, # otherwise they will cause an extra, useless queueing step in FTS sourceIsTape = self.__isTapeSE(self.sourceSE) copy_pin_lifetime = pinTime if sourceIsTape else None bring_online = 86400 if sourceIsTape else None # We add a few metadata to the fts job so that we can reuse them later on without # querying our DB. # source and target SE are just used for accounting purpose job_metadata = { 'operationID': self.operationID, 'sourceSE': self.sourceSE, 'targetSE': self.targetSE} job = fts3.new_job(transfers=transfers, overwrite=True, source_spacetoken=target_spacetoken, spacetoken=target_spacetoken, bring_online=bring_online, copy_pin_lifetime=copy_pin_lifetime, retry=3, metadata=job_metadata, priority=self.priority) return S_OK((job, fileIDsInTheJob))
def __init__( self, requestString, requestName, executionOrder, jobID, configPath ): """ c'tor :param self: self reference :param str requestString: XML serialised RequestContainer :param str requestName: request name :param list executionOrder: request execution order :param int jobID: jobID :param str sourceServer: request's source server :param str configPath: path in CS for parent agent """ ## fixtures ## python fixtures import os, os.path, sys, time, re, types self.makeGlobal( "os", os ) self.makeGlobal( "os.path", os.path ) self.makeGlobal( "sys", sys ) self.makeGlobal( "time", time ) self.makeGlobal( "re", re ) ## export all Types from types [ self.makeGlobal( item, getattr( types, item ) ) for item in dir(types) if "Type" in item ] ## DIRAC fixtures from DIRAC.FrameworkSystem.Client.Logger import gLogger self.__log = gLogger.getSubLogger( "%s/%s" % ( self.__class__.__name__, str(requestName) ) ) self.always = self.__log.always self.notice = self.__log.notice self.info = self.__log.info self.debug = self.__log.debug self.warn = self.__log.warn self.error = self.__log.error self.exception = self.__log.exception self.fatal = self.__log.fatal from DIRAC import S_OK, S_ERROR from DIRAC.ConfigurationSystem.Client.Config import gConfig from DIRAC.FrameworkSystem.Client.ProxyManagerClient import gProxyManager from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getGroupsWithVOMSAttribute from DIRAC.ConfigurationSystem.Client.ConfigurationData import gConfigurationData ## export DIRAC global tools and functions self.makeGlobal( "S_OK", S_OK ) self.makeGlobal( "S_ERROR", S_ERROR ) self.makeGlobal( "gLogger", gLogger ) self.makeGlobal( "gConfig", gConfig ) self.makeGlobal( "gProxyManager", gProxyManager ) self.makeGlobal( "getGroupsWithVOMSAttribute", getGroupsWithVOMSAttribute ) self.makeGlobal( "gConfigurationData", gConfigurationData ) ## save request string self.requestString = requestString ## build request object from DIRAC.RequestManagementSystem.Client.RequestContainer import RequestContainer self.requestObj = RequestContainer( init = False ) self.requestObj.parseRequest( request = self.requestString ) ## save request name self.requestName = requestName ## .. and jobID self.jobID = jobID ## .. and execution order self.executionOrder = executionOrder ## save config path self.__configPath = configPath ## set requestType self.setRequestType( gConfig.getValue( os.path.join( configPath, "RequestType" ), "" ) ) ## get log level self.__log.setLevel( gConfig.getValue( os.path.join( configPath, self.__class__.__name__, "LogLevel" ), "INFO" ) ) ## clear monitoring self.__monitor = {} ## save DataManager proxy if "X509_USER_PROXY" in os.environ: self.info("saving path to current proxy file") self.__dataManagerProxy = os.environ["X509_USER_PROXY"] else: self.error("'X509_USER_PROXY' environment variable not set")
LockRing = None try: from DIRAC.Core.Utilities.ReturnValues import S_OK, S_ERROR except ImportError: def S_OK(val=""): """ dummy S_OK """ return {'OK': True, 'Value': val} def S_ERROR(mess): """ dummy S_ERROR """ return {'OK': False, 'Message': mess} sLog = gLogger.getSubLogger(__name__) class WorkingProcess(multiprocessing.Process): """ .. class:: WorkingProcess WorkingProcess is a class that represents activity that runs in a separate process. It is running main thread (process) in daemon mode, reading tasks from :pendingQueue:, executing them and pushing back tasks with results to the :resultsQueue:. If task has got a timeout value defined a separate threading.Timer thread is started killing execution (and destroying worker) after :ProcessTask.__timeOut: seconds. Main execution could also terminate in a few different ways:
def submit(self, context=None, ftsServer=None, ucert=None, pinTime=36000, protocols=None): """ submit the job to the FTS server Some attributes are expected to be defined for the submission to work: * type (set by FTS3Operation) * sourceSE (only for Transfer jobs) * targetSE * activity (optional) * priority (optional) * username * userGroup * filesToSubmit * operationID (optional, used as metadata for the job) We also expect the FTSFiles to have an ID defined, as it is given as transfer metadata :param pinTime: Time the file should be pinned on disk (used for transfers and staging) Used only if the source SE is a tape storage :param context: fts3 context. If not given, it is created (see ftsServer & ucert param) :param ftsServer: the address of the fts server to submit to. Used only if context is not given. If not given either, use the ftsServer object attribute :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc) :param protocols: list of protocols from which we should choose the protocol to use :returns: S_OK([FTSFiles ids of files submitted]) """ log = gLogger.getSubLogger( "submit/%s/%s_%s" % (self.operationID, self.sourceSE, self.targetSE), True) if not context: if not ftsServer: ftsServer = self.ftsServer context = fts3.Context(endpoint=ftsServer, ucert=ucert, request_class=ftsSSLRequest, verify=False) # Construct the target SURL res = self.__fetchSpaceToken(self.targetSE) if not res['OK']: return res target_spacetoken = res['Value'] allLFNs = [ftsFile.lfn for ftsFile in self.filesToSubmit] if self.type == 'Transfer': res = self._constructTransferJob(pinTime, allLFNs, target_spacetoken, protocols=protocols) elif self.type == 'Staging': res = self._constructStagingJob(pinTime, allLFNs, target_spacetoken) # elif self.type == 'Removal': # res = self._constructRemovalJob(context, allLFNs, failedLFNs, target_spacetoken) if not res['OK']: return res job, fileIDsInTheJob = res['Value'] setFileIdsInTheJob = set(fileIDsInTheJob) try: self.ftsGUID = fts3.submit(context, job) log.info("Got GUID %s" % self.ftsGUID) # Only increase the number of attempts # if we succeeded in submitting -> no ! Why did I do that ?? for ftsFile in self.filesToSubmit: ftsFile.attempt += 1 if ftsFile.fileID in setFileIdsInTheJob: ftsFile.status = 'Submitted' now = datetime.datetime.utcnow().replace(microsecond=0) self.submitTime = now self.lastUpdate = now self.lastMonitor = now except FTS3ClientException as e: log.exception("Error at submission", repr(e)) return S_ERROR("Error at submission: %s" % e) return S_OK(fileIDsInTheJob)
import io import hashlib import threading import six from six import StringIO from DIRAC.Core.Utilities.ReturnValues import S_OK, S_ERROR from DIRAC.FrameworkSystem.Client.Logger import gLogger try: # Python 2: "file" is built-in file_types = file, io.IOBase except NameError: # Python 3: "file" fully replaced with IOBase file_types = (io.IOBase, ) gLogger = gLogger.getSubLogger("FileTransmissionHelper") class FileHelper(object): __validDirections = ("toClient", "fromClient", "receive", "send") __directionsMapping = {"toClient": "send", "fromClient": "receive"} def __init__(self, oTransport=None, checkSum=True): self.oTransport = oTransport self.__checkMD5 = checkSum self.__oMD5 = hashlib.md5() self.bFinishedTransmission = False self.bReceivedEOF = False self.direction = False self.packetSize = 1048576
def processResults(self): """ Execute tasks' callbacks removing them from results queue :param self: self reference """ processed = 0 log = gLogger.getSubLogger('ProcessPool') while True: if (not log.debug( "Start loop (t=0) queue size = %d, processed = %d" % (self.__resultsQueue.qsize(), processed)) and processed == 0 and self.__resultsQueue.qsize()): log.info("Process results, queue size = %d" % self.__resultsQueue.qsize()) start = time.time() self.__cleanDeadProcesses() log.debug("__cleanDeadProcesses", 't=%.2f' % (time.time() - start)) if not self.__pendingQueue.empty(): self.__spawnNeededWorkingProcesses() log.debug("__spawnNeededWorkingProcesses", 't=%.2f' % (time.time() - start)) time.sleep(0.1) if self.__resultsQueue.empty(): if self.__resultsQueue.qsize(): log.warn( "Results queue is empty but has non zero size: %d" % self.__resultsQueue.qsize()) # We only commit suicide if we reach a backlog greater than the maximum number of workers if self.__resultsQueue.qsize() > self.__maxSize: return -1 else: return 0 if processed == 0: log.verbose("Process results, but queue is empty...") break # # get task task = self.__resultsQueue.get() log.debug("__resultsQueue.get", 't=%.2f' % (time.time() - start)) # # execute callbacks try: task.doExceptionCallback() task.doCallback() log.debug("doCallback", 't=%.2f' % (time.time() - start)) if task.usePoolCallbacks(): if self.__poolExceptionCallback and task.exceptionRaised(): self.__poolExceptionCallback(task.getTaskID(), task.taskException()) if self.__poolCallback and task.taskResults(): self.__poolCallback(task.getTaskID(), task.taskResults()) log.debug("__poolCallback", 't=%.2f' % (time.time() - start)) except Exception as error: log.exception("Exception in callback", lException=error) pass processed += 1 if processed: log.info("Processed %d results" % processed) else: log.debug("No results processed") return processed
class AuthManager( object ): """ Handle Service Authorization """ __authLogger = gLogger.getSubLogger( "Authorization" ) KW_HOSTS_GROUP = 'hosts' KW_DN = 'DN' KW_GROUP = 'group' KW_EXTRA_CREDENTIALS = 'extraCredentials' KW_PROPERTIES = 'properties' KW_USERNAME = '******' def __init__( self, authSection ): """ Constructor :type authSection: string :param authSection: Section containing the authorization rules """ self.authSection = authSection def authQuery( self, methodQuery, credDict, defaultProperties = False ): """ Check if the query is authorized for a credentials dictionary :type methodQuery: string :param methodQuery: Method to test :type credDict: dictionary :param credDict: dictionary containing credentials for test. The dictionary can contain the DN and selected group. :return: Boolean result of test """ userString = "" if self.KW_DN in credDict: userString += "DN=%s" % credDict[ self.KW_DN ] if self.KW_GROUP in credDict: userString += " group=%s" % credDict[ self.KW_GROUP ] if self.KW_EXTRA_CREDENTIALS in credDict: userString += " extraCredentials=%s" % str( credDict[ self.KW_EXTRA_CREDENTIALS ] ) self.__authLogger.verbose( "Trying to authenticate %s" % userString ) # Get properties requiredProperties = self.getValidPropertiesForMethod( methodQuery, defaultProperties ) # Extract valid groups validGroups = self.getValidGroups( requiredProperties ) lowerCaseProperties = [ prop.lower() for prop in requiredProperties ] if not lowerCaseProperties: lowerCaseProperties = ['any'] allowAll = "any" in lowerCaseProperties or "all" in lowerCaseProperties #Set no properties by default credDict[ self.KW_PROPERTIES ] = [] #Check non secure backends if self.KW_DN not in credDict or not credDict[ self.KW_DN ]: if allowAll and not validGroups: self.__authLogger.verbose( "Accepted request from unsecure transport" ) return True else: self.__authLogger.verbose( "Explicit property required and query seems to be coming through an unsecure transport" ) return False #Check if query comes though a gateway/web server if self.forwardedCredentials( credDict ): self.__authLogger.verbose( "Query comes from a gateway" ) self.unpackForwardedCredentials( credDict ) return self.authQuery( methodQuery, credDict ) #Get the properties #Check for invalid forwarding if self.KW_EXTRA_CREDENTIALS in credDict: #Invalid forwarding? if not isinstance ( credDict[ self.KW_EXTRA_CREDENTIALS ], basestring ): self.__authLogger.verbose( "The credentials seem to be forwarded by a host, but it is not a trusted one" ) return False #Is it a host? 
if self.KW_EXTRA_CREDENTIALS in credDict and credDict[ self.KW_EXTRA_CREDENTIALS ] == self.KW_HOSTS_GROUP: #Get the nickname of the host credDict[ self.KW_GROUP ] = credDict[ self.KW_EXTRA_CREDENTIALS ] #HACK TO MAINTAIN COMPATIBILITY else: if self.KW_EXTRA_CREDENTIALS in credDict and self.KW_GROUP not in credDict: credDict[ self.KW_GROUP ] = credDict[ self.KW_EXTRA_CREDENTIALS ] #END OF HACK #Get the username if self.KW_DN in credDict and credDict[ self.KW_DN ]: if self.KW_GROUP not in credDict: result = CS.findDefaultGroupForDN( credDict[ self.KW_DN ] ) if not result['OK']: return False credDict[ self.KW_GROUP ] = result['Value'] if credDict[ self.KW_GROUP ] == self.KW_HOSTS_GROUP: #For host if not self.getHostNickName( credDict ): self.__authLogger.warn( "Host is invalid" ) if not allowAll: return False #If all, then set anon credentials credDict[ self.KW_USERNAME ] = "anonymous" credDict[ self.KW_GROUP ] = "visitor" else: #For users if not self.getUsername( credDict ): self.__authLogger.warn( "User is invalid or does not belong to the group it's saying" ) if not allowAll: return False #If all, then set anon credentials credDict[ self.KW_USERNAME ] = "anonymous" credDict[ self.KW_GROUP ] = "visitor" #If any or all in the props, allow allowGroup = not validGroups or credDict[ self.KW_GROUP ] in validGroups if allowAll and allowGroup: return True #Check authorized groups if "authenticated" in lowerCaseProperties and allowGroup: return True if not self.matchProperties( credDict, requiredProperties ): self.__authLogger.warn( "Client is not authorized\nValid properties: %s\nClient: %s" % ( requiredProperties, credDict ) ) return False elif not allowGroup: self.__authLogger.warn( "Client is not authorized\nValid groups: %s\nClient: %s" % ( validGroups, credDict ) ) return False return True def getHostNickName( self, credDict ): """ Discover the host nickname associated to the DN. The nickname will be included in the credentials dictionary. :type credDict: dictionary :param credDict: Credentials to ckeck :return: Boolean specifying whether the nickname was found """ if self.KW_DN not in credDict: return True if self.KW_GROUP not in credDict: return False retVal = CS.getHostnameForDN( credDict[ self.KW_DN ] ) if not retVal[ 'OK' ]: gLogger.warn( "Cannot find hostname for DN %s: %s" % ( credDict[ self.KW_DN ], retVal[ 'Message' ] ) ) return False credDict[ self.KW_USERNAME ] = retVal[ 'Value' ] credDict[ self.KW_PROPERTIES ] = CS.getPropertiesForHost( credDict[ self.KW_USERNAME ], [] ) return True def getValidPropertiesForMethod( self, method, defaultProperties = False ): """ Get all authorized groups for calling a method :type method: string :param method: Method to test :return: List containing the allowed groups """ authProps = gConfig.getValue( "%s/%s" % ( self.authSection, method ), [] ) if authProps: return authProps if defaultProperties: self.__authLogger.verbose( "Using hardcoded properties for method %s : %s" % ( method, defaultProperties ) ) if type( defaultProperties ) not in ( types.ListType, types.TupleType ): return List.fromChar( defaultProperties ) return defaultProperties defaultPath = "%s/Default" % "/".join( method.split( "/" )[:-1] ) authProps = gConfig.getValue( "%s/%s" % ( self.authSection, defaultPath ), [] ) if authProps: self.__authLogger.verbose( "Method %s has no properties defined using %s" % ( method, defaultPath ) ) return authProps self.__authLogger.verbose( "Method %s has no authorization rules defined. 
Allowing no properties" % method ) return [] def getValidGroups( self, rawProperties ): """ Get valid groups as specified in the method authorization rules :param list rawProperties: all method properties :return: list of allowed groups or [] """ validGroups = [] for prop in list( rawProperties ): if prop.startswith( 'group:' ): rawProperties.remove( prop ) prop = prop.replace( 'group:', '' ) validGroups.append( prop ) elif prop.startswith( 'vo:' ): rawProperties.remove( prop ) vo = prop.replace( 'vo:', '' ) result = getGroupsForVO( vo ) if result['OK']: validGroups.extend( result['Value'] ) validGroups = list( set( validGroups ) ) return validGroups def forwardedCredentials( self, credDict ): """ Check whether the credentials are being forwarded by a valid source :type credDict: dictionary :param credDict: Credentials to ckeck :return: Boolean with the result """ if self.KW_EXTRA_CREDENTIALS in credDict and type( credDict[ self.KW_EXTRA_CREDENTIALS ] ) == types.TupleType: if self.KW_DN in credDict: retVal = CS.getHostnameForDN( credDict[ self.KW_DN ] ) if retVal[ 'OK' ]: hostname = retVal[ 'Value' ] if Properties.TRUSTED_HOST in CS.getPropertiesForHost( hostname, [] ): return True return False def unpackForwardedCredentials( self, credDict ): """ Extract the forwarded credentials :type credDict: dictionary :param credDict: Credentials to unpack """ credDict[ self.KW_DN ] = credDict[ self.KW_EXTRA_CREDENTIALS ][0] credDict[ self.KW_GROUP ] = credDict[ self.KW_EXTRA_CREDENTIALS ][1] del( credDict[ self.KW_EXTRA_CREDENTIALS ] ) def getUsername( self, credDict ): """ Discover the username associated to the DN. It will check if the selected group is valid. The username will be included in the credentials dictionary. :type credDict: dictionary :param credDict: Credentials to ckeck :return: Boolean specifying whether the username was found """ if self.KW_DN not in credDict: return True if self.KW_GROUP not in credDict: result = CS.findDefaultGroupForDN( credDict[ self.KW_DN ] ) if not result['OK']: return False credDict[ self.KW_GROUP ] = result['Value'] credDict[ self.KW_PROPERTIES ] = CS.getPropertiesForGroup( credDict[ self.KW_GROUP ], [] ) usersInGroup = CS.getUsersInGroup( credDict[ self.KW_GROUP ], [] ) if not usersInGroup: return False retVal = CS.getUsernameForDN( credDict[ self.KW_DN ], usersInGroup ) if retVal[ 'OK' ]: credDict[ self.KW_USERNAME ] = retVal[ 'Value' ] return True return False def matchProperties( self, credDict, validProps, caseSensitive = False ): """ Return True if one or more properties are in the valid list of properties :type props: list :param props: List of properties to match :type validProps: list :param validProps: List of valid properties :return: Boolean specifying whether any property has matched the valid ones """ #HACK: Map lower case properties to properties to make the check in lowercase but return the proper case if not caseSensitive: validProps = dict( ( prop.lower(), prop ) for prop in validProps ) else: validProps = dict( ( prop, prop ) for prop in validProps ) groupProperties = credDict[ self.KW_PROPERTIES ] foundProps = [] for prop in groupProperties: if not caseSensitive: prop = prop.lower() if prop in validProps: foundProps.append( validProps[ prop ] ) credDict[ self.KW_PROPERTIES ] = foundProps return foundProps
def getFTS3Context(self, username, group, ftsServer, threadID): """ Returns an fts3 context for a given user, group and fts server The context pool is per thread, and there is one context per tuple (user, group, server). We dump the proxy of a user to a file (shared by all the threads), and use it to make the context. The proxy needs a lifetime of self.proxyLifetime, is cached for cacheTime = (2*lifeTime/3) - 10mn, and the lifetime of the context is 45mn The reason for cacheTime to be what it is is because the FTS3 server will ask for a new proxy after 2/3rd of the existing proxy has expired, so we renew it just before :param str username: name of the user :param str group: group of the user :param str ftsServer: address of the server :param str threadID: thread ID :returns: S_OK with the context object """ log = gLogger.getSubLogger("getFTS3Context", child=True) contextes = self._globalContextCache.setdefault(threadID, DictCache()) idTuple = (username, group, ftsServer) log.debug("Getting context for %s" % (idTuple, )) # We keep a context in the cache for 45 minutes # (so it needs to be valid at least 15 since we add it for one hour) if not contextes.exists(idTuple, 15 * 60): res = getDNForUsername(username) if not res['OK']: return res # We take the first DN returned userDN = res['Value'][0] log.debug("UserDN %s" % userDN) # We dump the proxy to a file. # It has to have a lifetime of self.proxyLifetime # Because the FTS3 servers cache it for 2/3rd of the lifetime # we should make our cache a bit less than 2/3rd of the lifetime cacheTime = int(2 * self.proxyLifetime / 3) - 600 res = gProxyManager.downloadVOMSProxyToFile( userDN, group, requiredTimeLeft=self.proxyLifetime, cacheTime=cacheTime) if not res['OK']: return res proxyFile = res['Value'] log.debug("Proxy file %s" % proxyFile) # We generate the context # In practice, the lifetime will be less than proxyLifetime # because we reuse a cached proxy. However, the cached proxy will # never forced a redelegation, because it is recent enough for FTS3 servers. # The delegation is forced when 2/3 rd of the lifetime are left, and we get a fresh # one just before. So no problem res = FTS3Job.generateContext(ftsServer, proxyFile, lifetime=self.proxyLifetime) if not res['OK']: return res context = res['Value'] # we add it to the cache for this thread for 1h contextes.add(idTuple, 3600, context) return S_OK(contextes.get(idTuple))
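The proxy cache time is chosen so that a fresh proxy is downloaded just before the FTS3 server would force a re-delegation at two thirds of the proxy lifetime. A worked example of that arithmetic; the 12 hour proxyLifetime is only an illustrative value.

proxyLifetime = 12 * 3600  # example: a 12 hour proxy, in seconds

# the FTS3 servers ask for a new delegation once 2/3 of the proxy lifetime has passed
redelegationPoint = 2 * proxyLifetime // 3          # 28800 s = 8 h

# the local cache keeps the proxy slightly less long, so it is renewed just before that
cacheTime = int(2 * proxyLifetime / 3) - 600        # 28200 s = 7 h 50 mn

print("redelegation after %s s, proxy cached for %s s" % (redelegationPoint, cacheTime))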
def __init__( self, requestString, requestName, executionOrder, jobID, configPath ): """ c'tor :param self: self reference :param str requestString: XML serialised RequestContainer :param str requestName: request name :param list executionOrder: request execution order :param int jobID: jobID :param str sourceServer: request's source server :param str configPath: path in CS for parent agent """ ## fixtures ## python fixtures import os, os.path, sys, time, re, types self.makeGlobal( "os", os ) self.makeGlobal( "os.path", os.path ) self.makeGlobal( "sys", sys ) self.makeGlobal( "time", time ) self.makeGlobal( "re", re ) ## export all Types from types [ self.makeGlobal( item, getattr( types, item ) ) for item in dir(types) if "Type" in item ] ## DIRAC fixtures from DIRAC.FrameworkSystem.Client.Logger import gLogger self.__log = gLogger.getSubLogger( "%s/%s" % ( self.__class__.__name__, str(requestName) ) ) self.always = self.__log.always self.notice = self.__log.notice self.info = self.__log.info self.debug = self.__log.debug self.warn = self.__log.warn self.error = self.__log.error self.exception = self.__log.exception self.fatal = self.__log.fatal from DIRAC import S_OK, S_ERROR from DIRAC.ConfigurationSystem.Client.Config import gConfig from DIRAC.FrameworkSystem.Client.ProxyManagerClient import gProxyManager from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getGroupsWithVOMSAttribute from DIRAC.ConfigurationSystem.Client.ConfigurationData import gConfigurationData ## export DIRAC global tools and functions self.makeGlobal( "S_OK", S_OK ) self.makeGlobal( "S_ERROR", S_ERROR ) self.makeGlobal( "gLogger", gLogger ) self.makeGlobal( "gConfig", gConfig ) self.makeGlobal( "gProxyManager", gProxyManager ) self.makeGlobal( "getGroupsWithVOMSAttribute", getGroupsWithVOMSAttribute ) self.makeGlobal( "gConfigurationData", gConfigurationData ) ## save request string self.requestString = requestString ## build request object from DIRAC.RequestManagementSystem.Client.RequestContainer import RequestContainer self.requestObj = RequestContainer( init = False ) self.requestObj.parseRequest( request = self.requestString ) ## save request name self.requestName = requestName ## .. and jobID self.jobID = jobID ## .. and execution order self.executionOrder = executionOrder ## save config path self.__configPath = configPath ## set requestType self.setRequestType( gConfig.getValue( os.path.join( configPath, "RequestType" ), "" ) ) ## get log level self.__log.setLevel( gConfig.getValue( os.path.join( configPath, self.__class__.__name__, "LogLevel" ), "INFO" ) ) ## clear monitoring self.__monitor = {} ## save DataManager proxy if "X509_USER_PROXY" in os.environ: self.info("saving path to current proxy file") self.__dataManagerProxy = os.environ["X509_USER_PROXY"] else: self.error("'X509_USER_PROXY' environment variable not set")
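makeGlobal is not defined in this snippet; as a rough, hypothetical sketch of what such a fixture helper typically does (exposing a name in the module namespace so the request handling code executed later can refer to it directly), it could look like the following. This is an assumption for illustration, not the actual DIRAC implementation.

  def makeGlobal( self, name, obj ):
    """ Hypothetical sketch only: expose :obj: under :name: in the global namespace. """
    globals()[ name ] = obj
    return True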
except ImportError:
  LockRing = None

try:
  from DIRAC.Core.Utilities.ReturnValues import S_OK, S_ERROR
except ImportError:

  def S_OK(val=""):
    """ dummy S_OK """
    return {'OK': True, 'Value': val}

  def S_ERROR(mess):
    """ dummy S_ERROR """
    return {'OK': False, 'Message': mess}

LOG = gLogger.getSubLogger(__name__)


class WorkingProcess(multiprocessing.Process):
  """
  .. class:: WorkingProcess

  WorkingProcess is a class that represents activity running in a separate process.

  It runs its main thread (process) in daemon mode, reading tasks from :pendingQueue:,
  executing them and pushing the tasks back, with their results, to the :resultsQueue:.
  If a task has a timeout value defined, a separate threading.Timer thread is started,
  killing the execution (and destroying the worker) after :ProcessTask.__timeOut: seconds.

  The main execution can also terminate in a few different ways:
  * on every failed read attempt (from empty :pendingQueue:), the idle loop counter is increased,
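The fallback S_OK/S_ERROR defined above mimic the shape of the real DIRAC return values; a quick illustration:

print(S_OK(42))        # {'OK': True, 'Value': 42}
print(S_ERROR("boom")) # {'OK': False, 'Message': 'boom'}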
def _constructTransferJob(self, pinTime, allLFNs, target_spacetoken, protocols=None): """ Build a job for transfer Some attributes of the job are expected to be set * sourceSE * targetSE * activity (optional) * priority (optional) * filesToSubmit * operationID (optional, used as metadata for the job) :param pinTime: pining time in case staging is needed :param allLFNs: list of LFNs to transfer :param failedLFNs: set of LFNs in filesToSubmit for which there was a problem :param target_spacetoken: the space token of the target :param protocols: list of protocols to restrict the protocol choice for the transfer :return: S_OK( (job object, list of ftsFileIDs in the job)) """ log = gLogger.getSubLogger( "constructTransferJob/%s/%s_%s" % (self.operationID, self.sourceSE, self.targetSE), True) res = self.__fetchSpaceToken(self.sourceSE) if not res['OK']: return res source_spacetoken = res['Value'] failedLFNs = set() dstSE = StorageElement(self.targetSE, vo=self.vo) srcSE = StorageElement(self.sourceSE, vo=self.vo) # getting all the (source, dest) surls res = dstSE.generateTransferURLsBetweenSEs(allLFNs, srcSE, protocols=protocols) if not res['OK']: return res for lfn, reason in res['Value']['Failed'].iteritems(): failedLFNs.add(lfn) log.error("Could not get source SURL", "%s %s" % (lfn, reason)) allSrcDstSURLs = res['Value']['Successful'] transfers = [] fileIDsInTheJob = [] for ftsFile in self.filesToSubmit: if ftsFile.lfn in failedLFNs: log.debug("Not preparing transfer for file %s" % ftsFile.lfn) continue sourceSURL, targetSURL = allSrcDstSURLs[ftsFile.lfn] if sourceSURL == targetSURL: log.error("sourceSURL equals to targetSURL", "%s" % ftsFile.lfn) ftsFile.error = "sourceSURL equals to targetSURL" ftsFile.status = 'Defunct' continue trans = fts3.new_transfer(sourceSURL, targetSURL, checksum='ADLER32:%s' % ftsFile.checksum, filesize=ftsFile.size, metadata=getattr(ftsFile, 'fileID'), activity=self.activity) transfers.append(trans) fileIDsInTheJob.append(getattr(ftsFile, 'fileID')) # If the source is not an tape SE, we should set the # copy_pin_lifetime and bring_online params to None, # otherwise they will do an extra useless queue in FTS sourceIsTape = self.__isTapeSE(self.sourceSE) copy_pin_lifetime = pinTime if sourceIsTape else None bring_online = BRING_ONLINE_TIMEOUT if sourceIsTape else None if not transfers: log.error("No transfer possible!") return S_ERROR("No transfer possible") # We add a few metadata to the fts job so that we can reuse them later on without # querying our DB. # source and target SE are just used for accounting purpose job_metadata = { 'operationID': self.operationID, 'sourceSE': self.sourceSE, 'targetSE': self.targetSE} job = fts3.new_job(transfers=transfers, overwrite=True, source_spacetoken=source_spacetoken, spacetoken=target_spacetoken, bring_online=bring_online, copy_pin_lifetime=copy_pin_lifetime, retry=3, metadata=job_metadata, priority=self.priority) return S_OK((job, fileIDsInTheJob))
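A hedged usage sketch of how the (job, fileIDsInTheJob) pair returned above is typically consumed, mirroring the submit() method shown later; variable names such as context and pinTime are assumed from the surrounding code.

res = self._constructTransferJob(pinTime, allLFNs, target_spacetoken)
if res['OK']:
    job, fileIDsInTheJob = res['Value']
    # an fts3 context is required here; see getFTS3Context above
    self.ftsGUID = fts3.submit(context, job)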
def _constructTransferJob(self, pinTime, allLFNs, target_spacetoken, protocols=None): """ Build a job for transfer Some attributes of the job are expected to be set * sourceSE * targetSE * activity (optional) * priority (optional) * filesToSubmit * operationID (optional, used as metadata for the job) :param pinTime: pining time in case staging is needed :param allLFNs: list of LFNs to transfer :param failedLFNs: set of LFNs in filesToSubmit for which there was a problem :param target_spacetoken: the space token of the target :param protocols: list of protocols to restrict the protocol choice for the transfer :return: S_OK( (job object, list of ftsFileIDs in the job)) """ log = gLogger.getSubLogger( "constructTransferJob/%s/%s_%s" % (self.operationID, self.sourceSE, self.targetSE), True) res = self.__fetchSpaceToken(self.sourceSE, self.vo) if not res['OK']: return res source_spacetoken = res['Value'] failedLFNs = set() dstSE = StorageElement(self.targetSE, vo=self.vo) srcSE = StorageElement(self.sourceSE, vo=self.vo) # If the source is not a tape SE, we should set the # copy_pin_lifetime and bring_online params to None, # otherwise they will do an extra useless queue in FTS sourceIsTape = self.__isTapeSE(self.sourceSE, self.vo) copy_pin_lifetime = pinTime if sourceIsTape else None bring_online = BRING_ONLINE_TIMEOUT if sourceIsTape else None # getting all the (source, dest) surls res = dstSE.generateTransferURLsBetweenSEs(allLFNs, srcSE, protocols=protocols) if not res['OK']: return res for lfn, reason in res['Value']['Failed'].items(): failedLFNs.add(lfn) log.error("Could not get source SURL", "%s %s" % (lfn, reason)) allSrcDstSURLs = res['Value']['Successful'] # This contains the staging URLs if they are different from the transfer URLs # (CTA...) allStageURLs = dict() # In case we are transfering from a tape system, and the stage protocol # is not the same as the transfer protocol, we generate the staging URLs # to do a multihop transfer. See below. if sourceIsTape: srcProto, _destProto = res['Value']['Protocols'] if srcProto not in srcSE.localStageProtocolList: # As of version 3.10, FTS can only handle one file per multi hop # job. If we are here, that means that we need one, so make sure that # we only have a single file to transfer (this should have been checked # at the job construction step in FTS3Operation). # This test is important, because multiple files would result in the source # being deleted ! 
if len(allLFNs) != 1: log.debug( "Multihop job has %s files while only 1 allowed" % len(allLFNs)) return S_ERROR( errno.E2BIG, "Trying multihop job with more than one file !") res = srcSE.getURL(allSrcDstSURLs, protocol=srcSE.localStageProtocolList) if not res['OK']: return res for lfn, reason in res['Value']['Failed'].items(): failedLFNs.add(lfn) log.error("Could not get stage SURL", "%s %s" % (lfn, reason)) allSrcDstSURLs.pop(lfn) allStageURLs = res['Value']['Successful'] transfers = [] fileIDsInTheJob = [] for ftsFile in self.filesToSubmit: if ftsFile.lfn in failedLFNs: log.debug("Not preparing transfer for file %s" % ftsFile.lfn) continue sourceSURL, targetSURL = allSrcDstSURLs[ftsFile.lfn] stageURL = allStageURLs.get(ftsFile.lfn) if sourceSURL == targetSURL: log.error("sourceSURL equals to targetSURL", "%s" % ftsFile.lfn) ftsFile.error = "sourceSURL equals to targetSURL" ftsFile.status = 'Defunct' continue ftsFileID = getattr(ftsFile, 'fileID') # Under normal circumstances, we simply submit an fts transfer as such: # * srcProto://myFile -> destProto://myFile # # Even in case of the source storage being a tape system, it works fine. # However, if the staging and transfer protocols are different (which might be the case for CTA), # we use the multihop machinery to submit two sequential fts transfers: # one to stage, one to transfer. # It looks like such # * stageProto://myFile -> stageProto://myFile # * srcProto://myFile -> destProto://myFile if stageURL: # We do not set a fileID in the metadata # such that we do not update the DB when monitoring stageTrans_metadata = {'desc': 'PreStage %s' % ftsFileID} stageTrans = fts3.new_transfer(stageURL, stageURL, checksum='ADLER32:%s' % ftsFile.checksum, filesize=ftsFile.size, metadata=stageTrans_metadata, activity=self.activity) transfers.append(stageTrans) trans_metadata = { 'desc': 'Transfer %s' % ftsFileID, 'fileID': ftsFileID } trans = fts3.new_transfer(sourceSURL, targetSURL, checksum='ADLER32:%s' % ftsFile.checksum, filesize=ftsFile.size, metadata=trans_metadata, activity=self.activity) transfers.append(trans) fileIDsInTheJob.append(ftsFileID) if not transfers: log.error("No transfer possible!") return S_ERROR("No transfer possible") # We add a few metadata to the fts job so that we can reuse them later on without # querying our DB. # source and target SE are just used for accounting purpose job_metadata = { 'operationID': self.operationID, 'rmsReqID': self.rmsReqID, 'sourceSE': self.sourceSE, 'targetSE': self.targetSE } job = fts3.new_job( transfers=transfers, overwrite=True, source_spacetoken=source_spacetoken, spacetoken=target_spacetoken, bring_online=bring_online, copy_pin_lifetime=copy_pin_lifetime, retry=3, verify_checksum= 'target', # Only check target vs specified, since we verify the source earlier multihop=bool( allStageURLs), # if we have stage urls, then we need multihop metadata=job_metadata, priority=self.priority) return S_OK((job, fileIDsInTheJob))
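To make the multihop case above concrete: for a single LFN on a tape system whose staging protocol differs from the transfer protocol, the transfers list ends up with two entries. The URLs and fileID below are assumed values, for illustration only.

# hop 1: staging, source == destination, no fileID in the metadata (not tracked in the DB)
# hop 2: the actual copy, carries the fileID so monitoring can update the file status
transfers = [
    fts3.new_transfer("root://tape.example/f", "root://tape.example/f",
                      metadata={'desc': 'PreStage 42'}),
    fts3.new_transfer("srm://tape.example/f", "srm://dest.example/f",
                      metadata={'desc': 'Transfer 42', 'fileID': 42}),
]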
def _constructTransferJob(self, pinTime, allLFNs, target_spacetoken, protocols=None): """Build a job for transfer Some attributes of the job are expected to be set * sourceSE * targetSE * multiHopSE (optional) * activity (optional) * priority (optional) * filesToSubmit * operationID (optional, used as metadata for the job) Note that, because of FTS limitations (and also because it anyway would be "not very smart"), multiHop can only use non-SRM disk storage as hops. :param pinTime: pining time in case staging is needed :param allLFNs: list of LFNs to transfer :param failedLFNs: set of LFNs in filesToSubmit for which there was a problem :param target_spacetoken: the space token of the target :param protocols: list of protocols to restrict the protocol choice for the transfer :return: S_OK( (job object, list of ftsFileIDs in the job)) """ log = gLogger.getSubLogger(f"constructTransferJob/{self.operationID}/{self.sourceSE}_{self.targetSE}") isMultiHop = False # Check if it is a multiHop transfer if self.multiHopSE: if len(allLFNs) != 1: log.debug("Multihop job has %s files while only 1 allowed" % len(allLFNs)) return S_ERROR(errno.E2BIG, "Trying multihop job with more than one file !") allHops = [(self.sourceSE, self.multiHopSE), (self.multiHopSE, self.targetSE)] isMultiHop = True else: allHops = [(self.sourceSE, self.targetSE)] nbOfHops = len(allHops) res = self.__fetchSpaceToken(self.sourceSE, self.vo) if not res["OK"]: return res source_spacetoken = res["Value"] failedLFNs = set() copy_pin_lifetime = None bring_online = None archive_timeout = None transfers = [] fileIDsInTheJob = set() for hopId, (hopSrcSEName, hopDstSEName) in enumerate(allHops, start=1): # Again, this is relevant only for the very initial source # but code factorization is more important hopSrcIsTape = self.__isTapeSE(hopSrcSEName, self.vo) dstSE = StorageElement(hopDstSEName, vo=self.vo) srcSE = StorageElement(hopSrcSEName, vo=self.vo) # getting all the (source, dest) surls res = dstSE.generateTransferURLsBetweenSEs(allLFNs, srcSE, protocols=protocols) if not res["OK"]: return res for lfn, reason in res["Value"]["Failed"].items(): failedLFNs.add(lfn) log.error("Could not get source SURL", "%s %s" % (lfn, reason)) allSrcDstSURLs = res["Value"]["Successful"] srcProto, destProto = res["Value"]["Protocols"] # If the source is a tape SE, we should set the # copy_pin_lifetime and bring_online params # In case of multihop, this is relevant only for the # original source, but again, code factorization is more important if hopSrcIsTape: copy_pin_lifetime = pinTime bring_online = srcSE.options.get("BringOnlineTimeout", BRING_ONLINE_TIMEOUT) # If the destination is a tape, and the protocol supports it, # check if we want to have an archive timeout # In case of multihop, this is relevant only for the # final target, but again, code factorization is more important dstIsTape = self.__isTapeSE(hopDstSEName, self.vo) if dstIsTape and destProto in dstSE.localStageProtocolList: archive_timeout = dstSE.options.get("ArchiveTimeout") # This contains the staging URLs if they are different from the transfer URLs # (CTA...) allStageURLs = dict() # In case we are transfering from a tape system, and the stage protocol # is not the same as the transfer protocol, we generate the staging URLs # to do a multihop transfer. See below. if hopSrcIsTape and srcProto not in srcSE.localStageProtocolList: isMultiHop = True # As of version 3.10, FTS can only handle one file per multi hop # job. 
If we are here, that means that we need one, so make sure that # we only have a single file to transfer (this should have been checked # at the job construction step in FTS3Operation). # This test is important, because multiple files would result in the source # being deleted ! if len(allLFNs) != 1: log.debug("Multihop job has %s files while only 1 allowed" % len(allLFNs)) return S_ERROR(errno.E2BIG, "Trying multihop job with more than one file !") res = srcSE.getURL(allSrcDstSURLs, protocol=srcSE.localStageProtocolList) if not res["OK"]: return res for lfn, reason in res["Value"]["Failed"].items(): failedLFNs.add(lfn) log.error("Could not get stage SURL", "%s %s" % (lfn, reason)) allSrcDstSURLs.pop(lfn) allStageURLs = res["Value"]["Successful"] for ftsFile in self.filesToSubmit: if ftsFile.lfn in failedLFNs: log.debug("Not preparing transfer for file %s" % ftsFile.lfn) continue sourceSURL, targetSURL = allSrcDstSURLs[ftsFile.lfn] stageURL = allStageURLs.get(ftsFile.lfn) if sourceSURL == targetSURL: log.error("sourceSURL equals to targetSURL", "%s" % ftsFile.lfn) ftsFile.error = "sourceSURL equals to targetSURL" ftsFile.status = "Defunct" continue ftsFileID = getattr(ftsFile, "fileID") # Under normal circumstances, we simply submit an fts transfer as such: # * srcProto://myFile -> destProto://myFile # # Even in case of the source storage being a tape system, it works fine. # However, if the staging and transfer protocols are different (which might be the case for CTA), # we use the multihop machinery to submit two sequential fts transfers: # one to stage, one to transfer. # It looks like such # * stageProto://myFile -> stageProto://myFile # * srcProto://myFile -> destProto://myFile if stageURL: # We do not set a fileID in the metadata # such that we do not update the DB when monitoring stageTrans_metadata = {"desc": "PreStage %s" % ftsFileID} # If we use an activity, also set it as file metadata # for WLCG monitoring purposes # https://its.cern.ch/jira/projects/DOMATPC/issues/DOMATPC-14? if self.activity: stageTrans_metadata["activity"] = self.activity stageTrans = fts3.new_transfer( stageURL, stageURL, checksum="ADLER32:%s" % ftsFile.checksum, filesize=ftsFile.size, metadata=stageTrans_metadata, activity=self.activity, ) transfers.append(stageTrans) # If it is the last hop only, we set the fileID metadata # for monitoring if hopId == nbOfHops: trans_metadata = {"desc": "Transfer %s" % ftsFileID, "fileID": ftsFileID} else: trans_metadata = {"desc": "MultiHop %s" % ftsFileID} # If we use an activity, also set it as file metadata # for WLCG monitoring purposes # https://its.cern.ch/jira/projects/DOMATPC/issues/DOMATPC-14? if self.activity: trans_metadata["activity"] = self.activity # because of an xroot bug (https://github.com/xrootd/xrootd/issues/1433) # the checksum needs to be lowercase. It does not impact the other # protocol, so it's fine to put it here. # I only add it in this transfer and not the "staging" one above because it # impacts only root -> root transfers trans = fts3.new_transfer( sourceSURL, targetSURL, checksum="ADLER32:%s" % ftsFile.checksum.lower(), filesize=ftsFile.size, metadata=trans_metadata, activity=self.activity, ) transfers.append(trans) fileIDsInTheJob.add(ftsFileID) if not transfers: log.error("No transfer possible!") return S_ERROR(errno.ENODATA, "No transfer possible") # We add a few metadata to the fts job so that we can reuse them later on without # querying our DB. 
# source and target SE are just used for accounting purpose job_metadata = { "operationID": self.operationID, "rmsReqID": self.rmsReqID, "sourceSE": self.sourceSE, "targetSE": self.targetSE, } if self.activity: job_metadata["activity"] = self.activity job = fts3.new_job( transfers=transfers, overwrite=True, source_spacetoken=source_spacetoken, spacetoken=target_spacetoken, bring_online=bring_online, copy_pin_lifetime=copy_pin_lifetime, retry=3, verify_checksum="target", # Only check target vs specified, since we verify the source earlier multihop=isMultiHop, metadata=job_metadata, priority=self.priority, archive_timeout=archive_timeout, ) return S_OK((job, fileIDsInTheJob))
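A trivial illustration of the checksum normalisation applied above because of the xrootd issue referenced in the comment (the checksum value is assumed):

checksum = "1A2B3C4D"                    # assumed ADLER32 value
print("ADLER32:%s" % checksum.lower())   # -> ADLER32:1a2b3c4d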
def _constructStagingJob(self, pinTime, allLFNs, target_spacetoken): """ Build a job for staging Some attributes of the job are expected to be set * targetSE * activity (optional) * priority (optional) * filesToSubmit * operationID (optional, used as metadata for the job) :param pinTime: pining time in case staging is needed :param allLFNs: List of LFNs to stage :param failedLFNs: set of LFNs in filesToSubmit for which there was a problem :param target_spacetoken: the space token of the target :return: S_OK( (job object, list of ftsFileIDs in the job)) """ log = gLogger.getSubLogger( "constructStagingJob/%s/%s" % (self.operationID, self.targetSE), True) transfers = [] fileIDsInTheJob = [] # Set of LFNs for which we did not get an SRM URL failedLFNs = set() # getting all the target surls res = StorageElement(self.targetSE, vo=self.vo).getURL(allLFNs, protocol='srm') if not res['OK']: return res for lfn, reason in res['Value']['Failed'].iteritems(): failedLFNs.add(lfn) log.error("Could not get target SURL", "%s %s" % (lfn, reason)) allTargetSURLs = res['Value']['Successful'] for ftsFile in self.filesToSubmit: if ftsFile.lfn in failedLFNs: log.debug("Not preparing transfer for file %s" % ftsFile.lfn) continue sourceSURL = targetSURL = allTargetSURLs[ftsFile.lfn] trans = fts3.new_transfer(sourceSURL, targetSURL, checksum='ADLER32:%s' % ftsFile.checksum, filesize=ftsFile.size, metadata=getattr(ftsFile, 'fileID'), activity=self.activity) transfers.append(trans) fileIDsInTheJob.append(getattr(ftsFile, 'fileID')) # If the source is not an tape SE, we should set the # copy_pin_lifetime and bring_online params to None, # otherwise they will do an extra useless queue in FTS sourceIsTape = self.__isTapeSE(self.sourceSE) copy_pin_lifetime = pinTime if sourceIsTape else None bring_online = 86400 if sourceIsTape else None # We add a few metadata to the fts job so that we can reuse them later on without # querying our DB. # source and target SE are just used for accounting purpose job_metadata = { 'operationID': self.operationID, 'sourceSE': self.sourceSE, 'targetSE': self.targetSE } job = fts3.new_job(transfers=transfers, overwrite=True, source_spacetoken=target_spacetoken, spacetoken=target_spacetoken, bring_online=bring_online, copy_pin_lifetime=copy_pin_lifetime, retry=3, metadata=job_metadata, priority=self.priority) return S_OK((job, fileIDsInTheJob))
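For staging jobs, the source and destination of each transfer are the same SURL, which is what triggers the bring-online on the tape system. A hedged sketch of a single entry (the SURL and checksum are assumed values):

surl = "srm://tape.example/lfn/data.raw"   # assumed target SURL
stage = fts3.new_transfer(surl, surl,      # identical source and destination
                          checksum='ADLER32:1a2b3c4d',
                          filesize=1024)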
def submit(self, context=None, ftsServer=None, ucert=None, pinTime=36000, ): """ submit the job to the FTS server Some attributes are expected to be defined for the submission to work: * type (set by FTS3Operation) * sourceSE (only for Transfer jobs) * targetSE * activity (optional) * priority (optional) * username * userGroup * filesToSubmit * operationID (optional, used as metadata for the job) We also expect the FTSFiles have an ID defined, as it is given as transfer metadata :param pinTime: Time the file should be pinned on disk (used for transfers and staging) Used only if he source SE is a tape storage :param context: fts3 context. If not given, it is created (see ftsServer & ucert param) :param ftsServer: the address of the fts server to submit to. Used only if context is not given. if not given either, use the ftsServer object attribute :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc) :returns S_OK([FTSFiles ids of files submitted]) """ log = gLogger.getSubLogger("submit/%s/%s_%s" % (self.operationID, self.sourceSE, self.targetSE), True) if not context: if not ftsServer: ftsServer = self.ftsServer context = fts3.Context( endpoint=ftsServer, ucert=ucert, request_class=ftsSSLRequest, verify=False) # Construct the target SURL res = self.__fetchSpaceToken(self.targetSE) if not res['OK']: return res target_spacetoken = res['Value'] allLFNs = [ftsFile.lfn for ftsFile in self.filesToSubmit] failedLFNs = set() # getting all the target surls res = StorageElement(self.targetSE, vo=self.vo).getURL(allLFNs, protocol='srm') if not res['OK']: return res for lfn, reason in res['Value']['Failed'].iteritems(): failedLFNs.add(lfn) log.error("Could not get target SURL", "%s %s" % (lfn, reason)) allTargetSURLs = res['Value']['Successful'] if self.type == 'Transfer': res = self._constructTransferJob( context, pinTime, allTargetSURLs, failedLFNs, target_spacetoken) elif self.type == 'Staging': res = self._constructStagingJob( context, pinTime, allTargetSURLs, failedLFNs, target_spacetoken) elif self.type == 'Removal': res = self._constructRemovalJob(context, allTargetSURLs, failedLFNs, target_spacetoken) if not res['OK']: return res job, fileIDsInTheJob = res['Value'] setFileIdsInTheJob = set(fileIDsInTheJob) try: self.ftsGUID = fts3.submit(context, job) log.info("Got GUID %s" % self.ftsGUID) # Only increase the amount of attempt # if we succeeded in submitting -> no ! Why did I do that ?? for ftsFile in self.filesToSubmit: ftsFile.attempt += 1 if ftsFile.fileID in setFileIdsInTheJob: ftsFile.status = 'Submitted' now = datetime.datetime.utcnow().replace(microsecond=0) self.submitTime = now self.lastUpdate = now self.lastMonitor = now except FTS3ClientException as e: log.exception("Error at submission", repr(e)) return S_ERROR("Error at submission: %s" % e) return S_OK(fileIDsInTheJob)
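A hedged usage sketch of this submit() method, as it might be driven without a pre-built context; the FTS endpoint and proxy path below are assumptions, not values from the source.

res = ftsJob.submit(ftsServer="https://fts3.example.org:8446",   # assumed endpoint
                    ucert="/tmp/x509up_u1000")                   # assumed proxy path
if res['OK']:
    submittedFileIds = res['Value']   # FTSFile ids actually included in the job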
def _monitorJob(self, ftsJob): """ * query the FTS servers * update the FTSFile status * update the FTSJob status :param ftsJob: FTS job :return: ftsJob, S_OK()/S_ERROR() """ # General try catch to avoid that the tread dies try: threadID = current_process().name log = gLogger.getSubLogger("_monitorJob/%s" % ftsJob.jobID, child=True) res = self.getFTS3Context(ftsJob.username, ftsJob.userGroup, ftsJob.ftsServer, threadID=threadID) if not res['OK']: log.error("Error getting context", res) return ftsJob, res context = res['Value'] res = ftsJob.monitor(context=context) if not res['OK']: log.error("Error monitoring job", res) # If the job was not found on the server, update the DB if cmpError(res, errno.ESRCH): res = self.fts3db.cancelNonExistingJob( ftsJob.operationID, ftsJob.ftsGUID) return ftsJob, res # { fileID : { Status, Error } } filesStatus = res['Value'] # Specify the job ftsGUID to make sure we do not overwrite # status of files already taken by newer jobs res = self.fts3db.updateFileStatus(filesStatus, ftsGUID=ftsJob.ftsGUID) if not res['OK']: log.error("Error updating file fts status", "%s, %s" % (ftsJob.ftsGUID, res)) return ftsJob, res upDict = { ftsJob.jobID: { 'status': ftsJob.status, 'error': ftsJob.error, 'completeness': ftsJob.completeness, 'operationID': ftsJob.operationID, 'lastMonitor': True, } } res = self.fts3db.updateJobStatus(upDict) if ftsJob.status in ftsJob.FINAL_STATES: self.__sendAccounting(ftsJob) return ftsJob, res except Exception as e: return ftsJob, S_ERROR(0, "Exception %s" % repr(e))
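The filesStatus dictionary handled above follows the { fileID : { Status, Error } } shape noted in the comment; an illustrative example with assumed fileIDs and key casing (the exact keys are whatever FTS3Job.monitor returns):

filesStatus = {
    101: {'status': 'Finished', 'error': ''},
    102: {'status': 'Failed', 'error': 'SOURCE CHECKSUM MISMATCH'},
}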
def _treatOperation(self, operation): """ Treat one operation: * does the callback if the operation is finished * generate new jobs and submits them :param operation: the operation to treat :param threadId: the id of the tread, it just has to be unique (used for the context cache) """ try: threadID = current_process().name log = gLogger.getSubLogger("treatOperation/%s" % operation.operationID, child=True) # If the operation is totally processed # we perform the callback if operation.isTotallyProcessed(): log.debug("FTS3Operation %s is totally processed" % operation.operationID) res = operation.callback() if not res['OK']: log.error("Error performing the callback", res) log.info("Putting back the operation") dbRes = self.fts3db.persistOperation(operation) if not dbRes['OK']: log.error("Could not persist operation", dbRes) return operation, res else: log.debug("FTS3Operation %s is not totally processed yet" % operation.operationID) res = operation.prepareNewJobs( maxFilesPerJob=self.maxFilesPerJob, maxAttemptsPerFile=self.maxAttemptsPerFile) if not res['OK']: log.error("Cannot prepare new Jobs", "FTS3Operation %s : %s" % (operation.operationID, res)) return operation, res newJobs = res['Value'] log.debug("FTS3Operation %s: %s new jobs to be submitted" % (operation.operationID, len(newJobs))) for ftsJob in newJobs: res = self._serverPolicy.chooseFTS3Server() if not res['OK']: log.error(res) continue ftsServer = res['Value'] log.debug("Use %s server" % ftsServer) ftsJob.ftsServer = ftsServer res = self.getFTS3Context( ftsJob.username, ftsJob.userGroup, ftsServer, threadID=threadID) if not res['OK']: log.error("Could not get context", res) continue context = res['Value'] res = ftsJob.submit(context=context, protocols=self.thirdPartyProtocols) if not res['OK']: log.error("Could not submit FTS3Job", "FTS3Operation %s : %s" % (operation.operationID, res)) continue operation.ftsJobs.append(ftsJob) submittedFileIds = res['Value'] log.info("FTS3Operation %s: Submitted job for %s transfers" % (operation.operationID, len(submittedFileIds))) # new jobs are put in the DB at the same time res = self.fts3db.persistOperation(operation) if not res['OK']: log.error("Could not persist operation", res) return operation, res except Exception as e: log.exception('Exception in the thread', repr(e)) return operation, S_ERROR("Exception %s" % repr(e))
def _treatOperation(self, operation): """ Treat one operation: * does the callback if the operation is finished * generate new jobs and submits them :param operation: the operation to treat :param threadId: the id of the tread, it just has to be unique (used for the context cache) """ try: threadID = current_process().name log = gLogger.getSubLogger("treatOperation/%s" % operation.operationID, child=True) # If the operation is totally processed # we perform the callback if operation.isTotallyProcessed(): log.debug("FTS3Operation %s is totally processed" % operation.operationID) res = operation.callback() if not res['OK']: log.error("Error performing the callback", res) log.info("Putting back the operation") dbRes = self.fts3db.persistOperation(operation) if not dbRes['OK']: log.error("Could not persist operation", dbRes) return operation, res else: log.debug("FTS3Operation %s is not totally processed yet" % operation.operationID) res = operation.prepareNewJobs( maxFilesPerJob=self.maxFilesPerJob, maxAttemptsPerFile=self.maxAttemptsPerFile) if not res['OK']: log.error("Cannot prepare new Jobs", "FTS3Operation %s : %s" % (operation.operationID, res)) return operation, res newJobs = res['Value'] log.debug("FTS3Operation %s: %s new jobs to be submitted" % (operation.operationID, len(newJobs))) for ftsJob in newJobs: res = self._serverPolicy.chooseFTS3Server() if not res['OK']: log.error(res) continue ftsServer = res['Value'] log.debug("Use %s server" % ftsServer) ftsJob.ftsServer = ftsServer res = self.getFTS3Context( ftsJob.username, ftsJob.userGroup, ftsServer, threadID=threadID) if not res['OK']: log.error("Could not get context", res) continue context = res['Value'] res = ftsJob.submit(context=context) if not res['OK']: log.error("Could not submit FTS3Job", "FTS3Operation %s : %s" % (operation.operationID, res)) continue operation.ftsJobs.append(ftsJob) submittedFileIds = res['Value'] log.info("FTS3Operation %s: Submitted job for %s transfers" % (operation.operationID, len(submittedFileIds))) # new jobs are put in the DB at the same time res = self.fts3db.persistOperation(operation) if not res['OK']: log.error("Could not persist operation", res) return operation, res except Exception as e: log.exception('Exception in the thread', repr(e)) return operation, S_ERROR("Exception %s" % repr(e))
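Note the return convention: _treatOperation always returns the operation together with an S_OK/S_ERROR, so the caller can still persist or log the operation when treatment failed. A minimal, hedged sketch of such a caller:

operation, res = self._treatOperation(operation)
if not res['OK']:
    gLogger.error("Treating operation failed", res.get('Message', ''))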
import os
try:
  import hashlib
  md5 = hashlib
except ImportError:
  import md5
import types
import threading
import cStringIO
import tarfile
import tempfile

from DIRAC.Core.Utilities.ReturnValues import S_OK, S_ERROR
from DIRAC.FrameworkSystem.Client.Logger import gLogger

gLogger = gLogger.getSubLogger( "FileTransmissionHelper" )

class FileHelper:

  __validDirections = ( "toClient", "fromClient", 'receive', 'send' )
  __directionsMapping = { 'toClient' : 'send', 'fromClient' : 'receive' }

  def __init__( self, oTransport = None, checkSum = True ):
    self.oTransport = oTransport
    self.__checkMD5 = checkSum
    self.__oMD5 = md5.md5()
    self.bFinishedTransmission = False
    self.bReceivedEOF = False
    self.direction = False
    self.packetSize = 1048576
    self.__fileBytes = 0
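A minimal illustration of the incremental checksum object kept in self.__oMD5, using the hashlib alias set up above; the data fed to update() is an assumed example.

h = md5.md5()                   # hashlib.md5() when hashlib is available
h.update("packet of bytes")     # a transmission helper would call this once per data packet
print(h.hexdigest())            # final hex digest, comparable at the end of the transmission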