def getReplicasPresence(self, lfns):
    """Split the given LFNs into those with and those without replicas.

    The file catalog is queried with ``getReplicas()`` in chunks of 100 LFNs.
    Every chunk is attempted up to 9 times, sleeping 0.1 s after each failed
    call; a chunk for which all attempts fail contributes to neither list.
    Replica information of successful LFNs is also merged into
    ``self.cachedReplicas``.

    :param lfns: LFNs to look up
    :returns: tuple ``(present, notPresent)`` of two lists of LFNs
    """
    chunkSize = 100
    maxAttempts = 9
    withReplicas = set()
    withoutReplicas = set()
    showProgress = len(lfns) > chunkSize
    began = time.time()
    suffix = (" (chunks of %d)" % chunkSize) if showProgress else "... "
    self.__write("Checking replicas for %d files%s" % (len(lfns), suffix))

    for lfnChunk in breakListIntoChunks(lfns, chunkSize):
        if showProgress:
            self.__write(".")
        attempt = 0
        while attempt < maxAttempts:
            attempt += 1
            reply = self.fileCatalog.getReplicas(lfnChunk)
            if not reply["OK"]:
                # give the catalog a short break before the next attempt
                time.sleep(0.1)
                continue
            found = reply["Value"]["Successful"]
            withReplicas.update(found)
            self.cachedReplicas.update(found)
            withoutReplicas.update(reply["Value"]["Failed"])
            break

    self.__write(" (%.1f seconds)\n" % (time.time() - began))
    if withoutReplicas:
        self.__logVerbose("Files without replicas:", "\n".join([""] + sorted(withoutReplicas)))
    return list(withReplicas), list(withoutReplicas)
def cleanOutputs(self, jobInfo):
    """Delete the existing output files of a job together with their descendants.

    Nothing is done when the job has no output files or when there is nothing
    to delete.  When ``self.enabled`` is false the candidate files are only
    logged.  Otherwise they are removed with ``DataManager().removeFile`` in
    chunks of 200, each chunk inside a ``UserProxy`` context built from
    ``self.authorDN`` and ``self.authorGroup``.  Per-file failures are
    collected and logged grouped by error reason.

    :param jobInfo: object providing ``outputFiles`` and ``outputFileStatus``
    :raises RuntimeError: if no proxy could be obtained or a removal call failed
    """
    if len(jobInfo.outputFiles) == 0:
        return

    descendants = self.__findAllDescendants(jobInfo.outputFiles)
    existingOutputFiles = []
    for lfn, status in izip_longest(jobInfo.outputFiles, jobInfo.outputFileStatus):
        if status == "Exists":
            existingOutputFiles.append(lfn)

    filesToDelete = existingOutputFiles + descendants
    if not filesToDelete:
        return

    listing = "\n +++ ".join(filesToDelete)
    if not self.enabled:
        self.log.notice("Would have removed these files: \n +++ %s " % listing)
        return
    self.log.notice("Remove these files: \n +++ %s " % listing)

    lfnsByReason = defaultdict(list)
    removedCount = 0
    for batch in breakListIntoChunks(filesToDelete, 200):
        with UserProxy(proxyUserDN=self.authorDN, proxyUserGroup=self.authorGroup) as proxyResult:
            if not proxyResult['OK']:
                raise RuntimeError('Failed to get a proxy: %s' % proxyResult['Message'])
            removal = DataManager().removeFile(batch)
            if not removal['OK']:
                self.log.error("Failed to remove LFNs", removal['Message'])
                raise RuntimeError("Failed to remove LFNs: %s" % removal['Message'])
            outcome = removal['Value']
            for lfn, err in outcome['Failed'].items():
                lfnsByReason[str(err)].append(lfn)
            removedCount += len(outcome['Successful'].keys())

    for reason, lfns in lfnsByReason.items():
        self.log.error("Failed to remove %d files with error: %s" % (len(lfns), reason))
    self.log.notice("Successfully removed %d files" % removedCount)
def _getJobStatusOnHost(self, jobIDList, host=None):
    """Query the status of the given jobs, 100 jobs per host command.

    Every job reference is reduced to its stamp (last component of the URL
    path) for the ``getJobStatus`` host command; the returned statuses are
    keyed by the original job reference again.

    :param jobIDList: iterable of job references (URL strings)
    :param host: forwarded as the ``host`` keyword of the host command
    :returns: S_OK(dict job reference -> status); the failed host command
              result; or S_ERROR when the command reports a non-zero status
    """
    refByStamp = {os.path.basename(urlparse(ref).path): ref for ref in jobIDList}
    statusByRef = {}
    for stampChunk in breakListIntoChunks(list(refByStamp), 100):
        outcome = self.__executeHostCommand("getJobStatus", {"JobIDList": stampChunk}, host=host)
        if not outcome["OK"]:
            return outcome
        payload = outcome["Value"]
        if payload["Status"] != 0:
            return S_ERROR("Failed to get job status: %s" % payload["Message"])
        jobStatuses = payload["Jobs"]
        for stamp in jobStatuses:
            statusByRef[refByStamp[stamp]] = jobStatuses[stamp]
    return S_OK(statusByRef)
def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):
    """Create the staging jobs for the files waiting to be submitted.

    Files are grouped by target SE.  Targets for which the 'ReadAccess' check
    fails are logged and skipped.  The files of every other target are split
    into jobs of at most ``maxFilesPerJob`` files; the target SE is also
    passed as source SE of the job.

    :param int maxFilesPerJob: maximum number of files in one job
    :param int maxAttemptsPerFile: forwarded to ``_getFilesToSubmit``
    :returns: S_OK(list of the new jobs)
    """
    log = gLogger.getSubLogger("_prepareNewJobs", child=True)

    filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile)
    log.debug("%s ftsFiles to submit" % len(filesToSubmit))

    newJobs = []

    # {targetSE : [FTS3Files] }
    filesGroupedByTarget = FTS3Utilities.groupFilesByTarget(filesToSubmit)

    # items() rather than iteritems(): the latter only exists on Python 2
    for targetSE, ftsFiles in filesGroupedByTarget.items():
        res = self._checkSEAccess(targetSE, 'ReadAccess', vo=self.vo)
        if not res['OK']:
            log.error(res)
            continue

        for ftsFilesChunk in breakListIntoChunks(ftsFiles, maxFilesPerJob):
            newJob = self._createNewJob('Staging', ftsFilesChunk, targetSE, sourceSE=targetSE)
            newJobs.append(newJob)

    return S_OK(newJobs)
def killJob(self, jobIDList):
    """Kill the specified jobs.

    :param jobIDList: a single job ID string or an iterable of job ID strings
    :returns: S_OK() when every chunk of jobs was cancelled; the failed result
              of ``_prepareProxy``; or S_ERROR as soon as one chunk could not
              be cancelled
    """
    result = self._prepareProxy()
    if not result['OK']:
        self.log.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Accept a single job ID as well as a collection of them
    if isinstance(jobIDList, six.string_types):
        jobList = [jobIDList]
    else:
        jobList = list(jobIDList)

    # The argument is wrapped in a 1-tuple: a bare tuple of job IDs would be
    # unpacked by the % operator and raise TypeError.
    self.log.debug("Killing jobs %s" % (jobIDList,))
    jobs = [self.__getARCJob(jobID) for jobID in jobList]

    # JobSupervisor is able to aggregate jobs to perform bulk operations and thus minimizes the communication overhead
    # We still need to create chunks to avoid timeout in the case there are too many jobs to supervise
    for chunk in breakListIntoChunks(jobs, 100):
        job_supervisor = arc.JobSupervisor(self.usercfg, chunk)
        if not job_supervisor.Cancel():
            errorString = ' - '.join(jobList).strip()
            return S_ERROR('Failed to kill at least one of these jobs: %s. CE(?) not reachable?' % errorString)

    return S_OK()
def getReplicas(self, lfns, allStatus=False):
    """Return the replicas of an LFN or a list of LFNs.

    The LFNs are processed in chunks of 1000.  For every replica record
    returned by ``self.client.list_replicas`` the first entry of each RSE is
    stored under ``Successful[lfn][rse]``; names and values are converted with
    ``str()`` when ``self.convertUnicode`` is set.  An empty record marks all
    DIDs of the current chunk as ``"Error"`` in ``Failed``.

    :param lfns: LFNs to look up (empty entries are ignored)
    :param allStatus: not used by this implementation
    :returns: dict ``{"OK": True, "Value": {"Successful": ..., "Failed": ...}}``
              or S_ERROR with the text of any exception raised on the way
    """
    successful = {}
    failed = {}
    for lfnList in breakListIntoChunks(lfns, 1000):
        try:
            didList = [self.__getDidsFromLfn(lfn) for lfn in lfnList if lfn]
            for rep in self.client.list_replicas(didList):
                if not rep:
                    for did in didList:
                        failed[did["name"]] = "Error"
                    continue
                lfn = rep["name"]
                if self.convertUnicode:
                    lfn = str(lfn)
                lfnReplicas = successful.setdefault(lfn, {})
                rses = rep["rses"]
                for rse in rses:
                    firstEntry = rses[rse][0]
                    if self.convertUnicode:
                        lfnReplicas[str(rse)] = str(firstEntry)
                    else:
                        lfnReplicas[rse] = firstEntry
        except Exception as err:
            return S_ERROR(str(err))
    return {"OK": True, "Value": {"Successful": successful, "Failed": failed}}
def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):
    """Create the staging jobs for the files waiting to be submitted.

    Files are grouped by target SE.  Targets for which the 'ReadAccess' check
    fails are logged and skipped.  The files of every other target are split
    into jobs of at most ``maxFilesPerJob`` files; the target SE is also
    passed as source SE of the job.

    :param int maxFilesPerJob: maximum number of files in one job
    :param int maxAttemptsPerFile: forwarded to ``_getFilesToSubmit``
    :returns: S_OK(list of the new jobs)
    """
    log = gLogger.getSubLogger("_prepareNewJobs", child=True)

    filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile)
    log.debug("%s ftsFiles to submit" % len(filesToSubmit))

    newJobs = []

    # {targetSE : [FTS3Files] }
    filesGroupedByTarget = FTS3Utilities.groupFilesByTarget(filesToSubmit)

    # items() rather than iteritems(): the latter only exists on Python 2
    for targetSE, ftsFiles in filesGroupedByTarget.items():
        res = self._checkSEAccess(targetSE, 'ReadAccess', vo=self.vo)
        if not res['OK']:
            log.error(res)
            continue

        for ftsFilesChunk in breakListIntoChunks(ftsFiles, maxFilesPerJob):
            newJob = self._createNewJob('Staging', ftsFilesChunk, targetSE, sourceSE=targetSE)
            newJobs.append(newJob)

    return S_OK(newJobs)
def getReplicasPresence(self, lfns):
    """Get the replicas using the standard FileCatalog.getReplicas().

    The LFNs are looked up in chunks of 100.  Each chunk is tried up to 9
    times with a 0.1 s pause after a failed call; a chunk that never succeeds
    ends up in neither of the returned lists.  Replicas of successful LFNs
    are also stored in ``self.cachedReplicas``.

    :param lfns: LFNs to check
    :returns: tuple ``(present, notPresent)`` of two lists of LFNs
    """
    present = set()
    notPresent = set()
    chunkSize = 100
    printProgress = (len(lfns) > chunkSize)
    startTime = time.time()
    self.__write("Checking replicas for %d files%s" %
                 (len(lfns), (' (chunks of %d)' % chunkSize) if printProgress else '... '))
    for chunk in breakListIntoChunks(lfns, chunkSize):
        if printProgress:
            self.__write('.')
        # range() instead of xrange(): xrange does not exist on Python 3 and
        # the sequence is tiny, so nothing is lost on Python 2
        for _ in range(1, 10):
            res = self.fileCatalog.getReplicas(chunk)
            if res['OK']:
                present.update(res['Value']['Successful'])
                self.cachedReplicas.update(res['Value']['Successful'])
                notPresent.update(res['Value']['Failed'])
                break
            else:
                time.sleep(0.1)
    self.__write(' (%.1f seconds)\n' % (time.time() - startTime))

    if notPresent:
        self.__logVerbose("Files without replicas:", '\n'.join([''] + sorted(notPresent)))
    return list(present), list(notPresent)
def __getGlue2ExecutionEnvironmentInfo(host, executionEnvironments):
    """Look up the given execution environments in the BDII, Glue2 schema.

    :param str host: BDII host to query
    :param list executionEnvironments: resource IDs of the execution environments to look up
    :returns: S_OK(list) with the concatenated ldapsearch results of all
              chunks; the failed search result; or S_ERROR when no chunk
              returned anything
    """
    collected = []
    # Query in chunks of 1000 to avoid "argument list too long"; it started
    # failing at about 1900 entries.
    for envChunk in breakListIntoChunks(executionEnvironments, 1000):
        idClauses = ''.join('(GLUE2ResourceID=%s)' % envID for envID in envChunk)
        searchFilter = "(&(objectClass=GLUE2ExecutionEnvironment)(|%s))" % idClauses
        answer = __ldapsearchBDII(filt=searchFilter, attr=None, host=host, base="o=glue", selectionString="GLUE2")
        if not answer['OK']:
            return answer
        if answer['Value']:
            collected += answer['Value']
        else:
            sLog.error("No information found for %s" % executionEnvironments)

    if collected:
        return S_OK(collected)
    return S_ERROR("No information found for executionEnvironments")
def getReplicasPresence(self, lfns):
    """Get the replicas using the standard FileCatalog.getReplicas().

    The LFNs are looked up through ``self.fc`` in chunks of 100.  Each chunk
    is tried up to 9 times with a 0.1 s pause after a failed call; a chunk
    that never succeeds ends up in neither of the returned lists.  Replicas
    of successful LFNs are also stored in ``self.cachedReplicas``.

    :param lfns: LFNs to check
    :returns: tuple ``(present, notPresent)`` of two lists of LFNs
    """
    present = set()
    notPresent = set()
    chunkSize = 100
    printProgress = (len(lfns) > chunkSize)
    startTime = time.time()
    self.__write("Checking replicas for %d files%s" %
                 (len(lfns), (' (chunks of %d)' % chunkSize) if printProgress else '... '))
    for chunk in breakListIntoChunks(lfns, chunkSize):
        if printProgress:
            self.__write('.')
        # range() instead of xrange(): xrange does not exist on Python 3 and
        # the sequence is tiny, so nothing is lost on Python 2
        for _ in range(1, 10):
            res = self.fc.getReplicas(chunk)
            if res['OK']:
                present.update(res['Value']['Successful'])
                self.cachedReplicas.update(res['Value']['Successful'])
                notPresent.update(res['Value']['Failed'])
                break
            else:
                time.sleep(0.1)
    self.__write(' (%.1f seconds)\n' % (time.time() - startTime))

    if notPresent:
        self.__logVerbose("Files without replicas:", '\n'.join([''] + sorted(notPresent)))
    return list(present), list(notPresent)
def _getJobStatusOnHost(self, jobIDList, host=None):
    """Get the status information for the given list of jobs.

    Job references are reduced to their stamp (last component of the URL
    path), sent to the ``getJobStatus`` host command 100 at a time, and the
    answers are keyed by the original job reference.

    :param jobIDList: iterable of job references (URL strings)
    :param host: forwarded as the ``host`` keyword of the host command
    :returns: S_OK(dict job reference -> status); the failed host command
              result; or S_ERROR when the command reports a non-zero status
    """
    resultDict = {}
    jobDict = {}
    for job in jobIDList:
        stamp = os.path.basename(urlparse(job).path)
        jobDict[stamp] = job
    # A real list is needed for chunking: on Python 3 dict.keys() is a view
    # that cannot be sliced.
    stampList = list(jobDict)

    for jobList in breakListIntoChunks(stampList, 100):
        resultCommand = self.__executeHostCommand('getJobStatus', {'JobIDList': jobList}, host=host)
        if not resultCommand['OK']:
            return resultCommand

        result = resultCommand['Value']
        if result['Status'] != 0:
            return S_ERROR('Failed to get job status: %s' % result['Message'])

        for stamp in result['Jobs']:
            resultDict[jobDict[stamp]] = result['Jobs'][stamp]

    return S_OK(resultDict)
def _splitByData(self):
    """Split the job so that each sub-job gets ``numberOfFilesPerJob`` input data.

    ``self.splittingOption`` is reset and, on success, ``self._data`` is
    replaced by its chunked form.

    :return: ``["InputData", chunked data, "ParametricInputData"]`` for
             setParameterSequence(), or False when no input data are set or
             ``numberOfFilesPerJob`` exceeds the number of input data
    :rtype: list or bool
    """
    # reset split attribute to avoid infinite loop
    self.splittingOption = None

    self.log.info("Job splitting: Splitting 'byData' method...")

    problem = None
    if not self._data:
        # data must have been specified with setInputData() beforehand
        problem = "Job splitting: missing input data"
    elif self.numberOfFilesPerJob > len(self._data):
        problem = "Job splitting: 'numberOfFilesPerJob' must be less/equal than the number of input data"
    if problem is not None:
        self.log.error(problem)
        return False

    self._data = breakListIntoChunks(self._data, self.numberOfFilesPerJob)
    self.log.info("Job splitting: submission consists of %d job(s)" % len(self._data))

    return ["InputData", self._data, 'ParametricInputData']
def _Broadcast(self):
    """Broadcast files found at the source SEs to all (or a selection of) target SEs.

    Plugin parameters read from ``self.params``:

    * SourceSE: string that is evaluated to the collection of source SEs
    * TargetSE: an SE name, a list of SE names, or a string holding a list literal
    * Destinations: optional; once this many targets are selected for a file,
      targets at further sites are skipped.  Ignored when it is not smaller
      than the number of target SEs.
    * Status: 'Flush' allows tasks with fewer than GroupSize files
    * GroupSize: number of files per task

    :returns: S_OK(list of (comma-joined target SEs, list of LFNs) tuples),
              or S_ERROR when no parameters are set
    """
    if not self.params:
        return S_ERROR("TransformationPlugin._Broadcast: The 'Broadcast' plugin requires additional parameters.")

    targetseParam = self.params['TargetSE']
    # NOTE(review): eval() executes arbitrary expressions; if these parameters
    # can be set by untrusted users, ast.literal_eval would be the safer choice.
    sourceSEs = eval(self.params['SourceSE'])
    if targetseParam.count('['):
        targetSEs = eval(targetseParam)
    elif isinstance(targetseParam, list):
        targetSEs = targetseParam
    else:
        targetSEs = [targetseParam]

    destinations = int(self.params.get('Destinations', 0))
    if destinations and (destinations >= len(targetSEs)):
        destinations = 0

    status = self.params['Status']
    groupSize = self.params['GroupSize']  # Number of files per tasks

    fileGroups = getFileGroups(self.data)  # groups by SE
    targetSELfns = {}
    for replicaSE, lfns in fileGroups.items():
        ses = replicaSE.split(',')
        # only treat files that have a replica at one of the source SEs
        if not any(se in sourceSEs for se in ses):
            continue

        for lfn in lfns:
            targets = []
            # fetched per file because the list is extended below
            sources = self._getSitesForSEs(ses)
            random.shuffle(targetSEs)
            for targetSE in targetSEs:
                site = self._getSiteForSE(targetSE)['Value']
                if site not in sources:
                    if destinations and (len(targets) >= destinations):
                        continue
                    sources.append(site)
                targets.append(targetSE)  # after all, if someone wants to copy to the source, it's his choice
            strTargetSEs = ','.join(sorted(targets))
            # setdefault instead of has_key(), which only exists on Python 2
            targetSELfns.setdefault(strTargetSEs, []).append(lfn)

    tasks = []
    for ses, lfns in targetSELfns.items():
        for taskLfns in breakListIntoChunks(lfns, groupSize):
            if (status == 'Flush') or (len(taskLfns) >= int(groupSize)):
                # do not allow groups smaller than the groupSize, except if transformation is in flush state
                tasks.append((ses, taskLfns))

    return S_OK(tasks)
def _Broadcast(self):
    """Broadcast files found at the source SEs to all (or a selection of) target SEs.

    Plugin parameters read from ``self.params``:

    * SourceSE: string that is evaluated to the collection of source SEs
    * TargetSE: an SE name, a list of SE names, or a string holding a list literal
    * Destinations: optional; once this many targets are selected for a file,
      targets at further sites are skipped.  Ignored when it is not smaller
      than the number of target SEs.
    * Status: 'Flush' allows tasks with fewer than GroupSize files
    * GroupSize: number of files per task

    :returns: S_OK(list of (comma-joined target SEs, list of LFNs) tuples),
              or S_ERROR when no parameters are set
    """
    if not self.params:
        return S_ERROR("TransformationPlugin._Broadcast: The 'Broadcast' plugin requires additional parameters.")

    targetseParam = self.params['TargetSE']
    # NOTE(review): eval() executes arbitrary expressions; if these parameters
    # can be set by untrusted users, ast.literal_eval would be the safer choice.
    sourceSEs = eval(self.params['SourceSE'])
    if targetseParam.count('['):
        targetSEs = eval(targetseParam)
    elif isinstance(targetseParam, list):
        targetSEs = targetseParam
    else:
        targetSEs = [targetseParam]

    destinations = int(self.params.get('Destinations', 0))
    if destinations and (destinations >= len(targetSEs)):
        destinations = 0

    status = self.params['Status']
    groupSize = self.params['GroupSize']  # Number of files per tasks

    fileGroups = getFileGroups(self.data)  # groups by SE
    targetSELfns = {}
    for replicaSE, lfns in fileGroups.items():
        ses = replicaSE.split(',')
        # only treat files that have a replica at one of the source SEs
        if not any(se in sourceSEs for se in ses):
            continue

        for lfn in lfns:
            targets = []
            # fetched per file because the list is extended below
            sources = self._getSitesForSEs(ses)
            random.shuffle(targetSEs)
            for targetSE in targetSEs:
                site = self._getSiteForSE(targetSE)['Value']
                if site not in sources:
                    if destinations and (len(targets) >= destinations):
                        continue
                    sources.append(site)
                targets.append(targetSE)  # after all, if someone wants to copy to the source, it's his choice
            strTargetSEs = ','.join(sorted(targets))
            # setdefault instead of has_key(), which only exists on Python 2
            targetSELfns.setdefault(strTargetSEs, []).append(lfn)

    tasks = []
    for ses, lfns in targetSELfns.items():
        for taskLfns in breakListIntoChunks(lfns, groupSize):
            if (status == 'Flush') or (len(taskLfns) >= int(groupSize)):
                # do not allow groups smaller than the groupSize, except if transformation is in flush state
                tasks.append((ses, taskLfns))

    return S_OK(tasks)
def groupByReplicas(self, files, status):
    """Generates tasks based on the location of the input data

    Two passes are made over the files not yet in a task: first grouped by
    set of SEs, then SE by SE.  Only chunks with ``self.groupSize`` files
    become tasks, except in the second pass when the status is 'Flush', where
    smaller chunks are accepted as well.

    :param files: anything accepted by ``dict()``, keyed by LFN, e.g.
       {'/this/is/at.1': ['SE1'],
        '/this/is/at.12': ['SE1', 'SE2'],
        '/this/is/at.2': ['SE2'],
        '/this/is/at_123': ['SE1', 'SE2', 'SE3'],
        '/this/is/at_23': ['SE2', 'SE3'],
        '/this/is/at_4': ['SE4']}
    :param str status: transformation status; 'Flush' enables flushing
    :returns: S_OK(list of (replicaSE, list of LFNs) tuples)
    """
    tasks = []
    nTasks = 0

    if not len(files):
        return S_OK(tasks)

    # work on a copy, files are removed from it as they are put in tasks
    files = dict(files)

    # Parameters
    if not self.groupSize:
        self.groupSize = self.getPluginParam('GroupSize', 10)
    flush = (status == 'Flush')
    self.logVerbose("groupByReplicas: %d files, groupSize %d, flush %s" % (len(files), self.groupSize, flush))

    # Consider files by groups of SEs, a file is only in one group
    # Then consider files site by site, but a file can now be at more than one site
    for groupSE in (True, False):
        if not files:
            break
        seFiles = getFileGroups(files, groupSE=groupSE)
        self.logDebug("fileGroups set: ", seFiles)

        for replicaSE in sortSEs(seFiles):
            lfns = seFiles[replicaSE]
            if lfns:
                tasksLfns = breakListIntoChunks(lfns, self.groupSize)
                lfnsInTasks = []
                for taskLfns in tasksLfns:
                    if (flush and not groupSE) or (len(taskLfns) >= self.groupSize):
                        tasks.append((replicaSE, taskLfns))
                        lfnsInTasks += taskLfns
                # Remove files from global list
                for lfn in lfnsInTasks:
                    files.pop(lfn)
                if not groupSE and lfnsInTasks:
                    # In case the file was at more than one site, remove it from the other sites' list.
                    # A set keeps the membership test O(1) instead of scanning the list for every LFN.
                    assigned = set(lfnsInTasks)
                    for se in [se for se in seFiles if se != replicaSE]:
                        seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in assigned]

        self.logVerbose("groupByReplicas: %d tasks created (groupSE %s), %d files not included in tasks" %
                        (len(tasks) - nTasks, str(groupSE), len(files)))
        nTasks = len(tasks)

    return S_OK(tasks)
def groupByReplicas(self, files, status):
    """Generates tasks based on the location of the input data

    Two passes are made over the files not yet in a task: first grouped by
    set of SEs, then SE by SE.  Only chunks with ``self.groupSize`` files
    become tasks, except in the second pass when the status is 'Flush', where
    smaller chunks are accepted as well.

    :param files: anything accepted by ``dict()``, keyed by LFN, e.g.
       {'/this/is/at.1': ['SE1'],
        '/this/is/at.12': ['SE1', 'SE2'],
        '/this/is/at.2': ['SE2'],
        '/this/is/at_123': ['SE1', 'SE2', 'SE3'],
        '/this/is/at_23': ['SE2', 'SE3'],
        '/this/is/at_4': ['SE4']}
    :param str status: transformation status; 'Flush' enables flushing
    :returns: S_OK(list of (replicaSE, list of LFNs) tuples)
    """
    tasks = []
    nTasks = 0

    if not len(files):
        return S_OK(tasks)

    # work on a copy, files are removed from it as they are put in tasks
    files = dict(files)

    # Parameters
    if not self.groupSize:
        self.groupSize = self.getPluginParam('GroupSize', 10)
    flush = (status == 'Flush')
    self.logVerbose("groupByReplicas: %d files, groupSize %d, flush %s" % (len(files), self.groupSize, flush))

    # Consider files by groups of SEs, a file is only in one group
    # Then consider files site by site, but a file can now be at more than one site
    for groupSE in (True, False):
        if not files:
            break
        seFiles = getFileGroups(files, groupSE=groupSE)
        self.logDebug("fileGroups set: ", seFiles)

        for replicaSE in sortSEs(seFiles):
            lfns = seFiles[replicaSE]
            if lfns:
                tasksLfns = breakListIntoChunks(lfns, self.groupSize)
                lfnsInTasks = []
                for taskLfns in tasksLfns:
                    if (flush and not groupSE) or (len(taskLfns) >= self.groupSize):
                        tasks.append((replicaSE, taskLfns))
                        lfnsInTasks += taskLfns
                # Remove files from global list
                for lfn in lfnsInTasks:
                    files.pop(lfn)
                if not groupSE and lfnsInTasks:
                    # In case the file was at more than one site, remove it from the other sites' list.
                    # A set keeps the membership test O(1) instead of scanning the list for every LFN.
                    assigned = set(lfnsInTasks)
                    for se in [se for se in seFiles if se != replicaSE]:
                        seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in assigned]

        self.logVerbose("groupByReplicas: %d tasks created (groupSE %s), %d files not included in tasks" %
                        (len(tasks) - nTasks, str(groupSE), len(files)))
        nTasks = len(tasks)

    return S_OK(tasks)
def _Broadcast(self):
    """Broadcast files present at the source SEs to all (or some) of the target SEs.

    Parameters read from ``self.params``:

    * SourceSE: optional; when given, only files with a replica at one of
      these SEs are treated
    * TargetSE: where to broadcast files to
    * Destinations: optional integer; once this many targets are selected for
      a file, targets at further sites are skipped.  It only takes effect
      when smaller than the number of TargetSEs.
    * Status: chunks with fewer than GroupSize files only become tasks when
      this is "Flush"
    * GroupSize: number of files per task

    :returns: S_OK(list of (comma-joined target SEs, list of LFNs) tuples),
              or S_ERROR when no parameters are set
    """
    if not self.params:
        return S_ERROR("TransformationPlugin._Broadcast: The 'Broadcast' plugin requires additional parameters.")

    allowedSources = set(self.util.seParamtoList(self.params.get("SourceSE", [])))
    targetSEs = self.util.seParamtoList(self.params["TargetSE"])
    maxTargets = int(self.params.get("Destinations", 0))
    if maxTargets and maxTargets >= len(targetSEs):
        maxTargets = 0
    status = self.params["Status"]
    groupSize = self.params["GroupSize"]  # Number of files per tasks

    lfnsByTargets = {}
    for replicaSE, lfns in getFileGroups(self.data).items():  # groups by SE
        ses = replicaSE.split(",")
        # with a source restriction, skip files that have no replica there
        if allowedSources and not set(ses).intersection(allowedSources):
            continue

        for lfn in lfns:
            chosen = []
            coveredSites = self._getSitesForSEs(ses)
            random.shuffle(targetSEs)
            for targetSE in targetSEs:
                site = self._getSiteForSE(targetSE)["Value"]
                isNewSite = site not in coveredSites
                if isNewSite and maxTargets and len(chosen) >= maxTargets:
                    continue
                if isNewSite:
                    coveredSites.append(site)
                # after all, if someone wants to copy to the source, it's his choice
                chosen.append(targetSE)
            key = ",".join(sorted(chosen))
            if key not in lfnsByTargets:
                lfnsByTargets[key] = []
            lfnsByTargets[key].append(lfn)

    tasks = []
    for ses, lfns in lfnsByTargets.items():
        for taskLfns in breakListIntoChunks(lfns, groupSize):
            # do not allow groups smaller than the groupSize, except if transformation is in flush state
            if (status == "Flush") or (len(taskLfns) >= int(groupSize)):
                tasks.append((ses, taskLfns))
    return S_OK(tasks)
def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):
    """Create the transfer jobs for the files waiting to be submitted.

    Files are grouped by target SE.  A target failing the 'WriteAccess' check
    is skipped; unless the failure is EACCES, the error is logged and the
    attempt counter of its files is incremented.  For every other target one
    source is selected per file (restricted to ``self.sourceSEs`` when set)
    and the files of each source are split into jobs of at most
    ``maxFilesPerJob`` files.

    :param int maxFilesPerJob: maximum number of files in one job
    :param int maxAttemptsPerFile: forwarded to ``_getFilesToSubmit``
    :returns: S_OK(list of the new jobs), or the failed result of the
              grouping / source selection
    """
    log = self._log.getSubLogger("_prepareNewJobs", child=True)

    filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile)
    log.debug("%s ftsFiles to submit" % len(filesToSubmit))

    newJobs = []

    # {targetSE : [FTS3Files] }
    res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
    if not res['OK']:
        return res
    filesGroupedByTarget = res['Value']

    # items() rather than iteritems(): the latter only exists on Python 2
    for targetSE, ftsFiles in filesGroupedByTarget.items():

        res = self._checkSEAccess(targetSE, 'WriteAccess', vo=self.vo)

        if not res['OK']:
            # If the SE is currently banned, we just skip it
            if cmpError(res, errno.EACCES):
                log.info("Write access currently not permitted to %s, skipping." % targetSE)
            else:
                log.error(res)
                for ftsFile in ftsFiles:
                    ftsFile.attempt += 1
            continue

        sourceSEs = self.sourceSEs.split(',') if self.sourceSEs is not None else []
        # { sourceSE : [FTSFiles] }
        res = FTS3Utilities.selectUniqueRandomSource(ftsFiles, allowedSources=sourceSEs)

        if not res['OK']:
            return res

        uniqueTransfersBySource = res['Value']

        # We don't need to check the source, since it is already filtered by the DataManager
        # (the per-source list gets its own name so it does not shadow the per-target ftsFiles)
        for sourceSE, sourceFtsFiles in uniqueTransfersBySource.items():

            for ftsFilesChunk in breakListIntoChunks(sourceFtsFiles, maxFilesPerJob):

                newJob = self._createNewJob('Transfer', ftsFilesChunk, targetSE, sourceSE=sourceSE)

                newJobs.append(newJob)

    return S_OK(newJobs)
def _getFileReplicas(self, fileIDs, fields_input=None, allStatus=False, connection=False):
    """Get the replicas of the files identified by the given file IDs.

    The stored procedure ``ps_get_all_info_of_replicas_bulk`` is called for
    chunks of 1000 file IDs.

    :param fileIDs: list of file ids
    :param fields_input: metadata of the replicas we are interested in
                         (defaults to PFN); Status is always added
    :param allStatus: passed to the stored procedure together with
                      ``db.visibleReplicaStatus``; per the original
                      documentation, True means all replica statuses are
                      considered instead of only the visible ones
    :param connection: not used by this implementation

    :returns: S_OK with a dict { fileID : { SE name : dict of metadata } };
              files without replicas map to an empty dict.  The failed
              result of the stored procedure is returned as is.
    """
    wanted = ["PFN"] if fields_input is None else list(fields_input)
    # Status is always part of the returned metadata
    if "Status" not in wanted:
        wanted.append("Status")

    # Every requested file starts with an empty dict, which is what is
    # returned for files that have no replica
    replicasByFile = {}
    for fileID in fileIDs:
        replicasByFile[fileID] = {}

    # Statuses formatted for the IN clause of the stored procedure
    visibleStatuses = stringListToString(self.db.visibleReplicaStatus)

    columns = ("FileID", "SE", "Status", "RepType", "CreationDate", "ModificationDate", "PFN")

    for idChunk in breakListIntoChunks(fileIDs, 1000):
        # File IDs formatted for the IN clause of the stored procedure
        outcome = self.db.executeStoredProcedureWithCursor(
            "ps_get_all_info_of_replicas_bulk", (intListToString(idChunk), allStatus, visibleStatuses))
        if not outcome["OK"]:
            return outcome

        for row in outcome["Value"]:
            record = dict(zip(columns, row))
            replicasByFile[record["FileID"]][record["SE"]] = {
                field: record.get(field, "Unknown metadata field") for field in wanted
            }

    return S_OK(replicasByFile)
def getFileMetadata(self, lfns, ownership=False):
    """Returns the file metadata associated to a supplied LFN.

    The LFNs are processed in chunks of 1000 through
    ``self.client.get_metadata_bulk``.  DIDs of type DATASET or CONTAINER get
    directory-like metadata whose ``NumberOfLinks`` is the number of entries
    returned by ``list_content``; all other DIDs get file metadata.

    :param lfns: LFNs to look up
    :param ownership: not used by this implementation
    :returns: S_OK({"Successful": {lfn: metadata dict}, "Failed": {lfn: reason}});
              every LFN for which no metadata was obtained is reported as
              "No such file or directory".  S_ERROR with the exception text
              on any unexpected exception.
    """
    successful, failed = {}, {}
    # LFNs for which no metadata was obtained yet
    unresolved = list(lfns)

    for chunk in breakListIntoChunks(lfns, 1000):
        try:
            dids = [self.__getDidsFromLfn(lfn) for lfn in chunk]
            for meta in self.client.get_metadata_bulk(dids):
                lfn = str(meta["name"])
                if meta["did_type"] in ("DATASET", "CONTAINER"):
                    nlinks = sum(1 for _ in self.client.list_content(meta["scope"], meta["name"]))
                    successful[lfn] = {
                        "Checksum": "",
                        "ChecksumType": "",
                        "CreationDate": meta["created_at"],
                        "GUID": "",
                        "Mode": 509,
                        "ModificationDate": meta["updated_at"],
                        "NumberOfLinks": nlinks,
                        "Size": 0,
                        "Status": "-",
                    }
                else:
                    guid = meta["guid"]
                    if guid:
                        guid = str(uuid.UUID(guid))
                    successful[lfn] = {
                        "Checksum": str(meta["adler32"]),
                        "ChecksumType": "AD",
                        "CreationDate": meta["created_at"],
                        "GUID": guid,
                        "Mode": 436,
                        "ModificationDate": meta["updated_at"],
                        "NumberOfLinks": 1,
                        "Size": meta["bytes"],
                        "Status": "-",
                    }
                try:
                    unresolved.remove(lfn)
                except ValueError:
                    # name returned by the client was not among the requested LFNs
                    pass
        except DataIdentifierNotFound:
            # The exception is not attributed to a specific LFN here: the only
            # name at hand would be the last one that was resolved
            # successfully (or none at all).  The LFNs of this chunk that got
            # no metadata stay in `unresolved` and are reported below.
            continue
        except Exception as err:
            return S_ERROR(str(err))

    for lfn in unresolved:
        failed[lfn] = "No such file or directory"

    return S_OK({"Failed": failed, "Successful": successful})
def __insertExistingTransformationFiles(self, transID, fileTuplesList, connection=False):
    """Insert files of an existing transformation into TransformationFiles.

    Extends DIRAC.__insertExistingTransformationFiles: does not add userSE and
    adds runNumber, size, file type and RAW ancestors.

    Files with status 'Removed' are skipped.  A status without '-' gets the
    suffix '-inherited', and in that case a non-zero task ID is rewritten as
    ``1000000 * originalID + taskID``.  Afterwards the run table entries of
    the original transformation (taken from the last file tuple) are copied.

    :param transID: ID of the transformation the files are inserted for
    :param fileTuplesList: list of tuples whose first 14 items are
        (lfn, originalID, fileID, status, taskID, targetSE, usedSE, errorCount,
        lastUpdate, insertTime, runNumber, size, fileType, rawAncestors)
    :param connection: DB connection forwarded to the update calls
    :returns: S_OK(), or the first failed result
    """
    gLogger.info("Inserting %d files in TransformationFiles" % len(fileTuplesList))
    if not fileTuplesList:
        # nothing to insert and no original transformation to copy runs from
        return S_OK()

    insertHeader = "INSERT INTO TransformationFiles (TransformationID,Status,TaskID,FileID, \
        TargetSE,LastUpdate,RunNumber,Size,FileType,RAWAncestors) VALUES"

    originalID = None
    # splitting in various chunks, in case it is too big
    for fileTuples in breakListIntoChunks(fileTuplesList, 10000):
        gLogger.verbose("Adding first %d files in TransformationFiles (out of %d)" %
                        (len(fileTuples), len(fileTuplesList)))
        # NOTE(review): values are interpolated directly into the SQL text, as before;
        # they are assumed to come from existing DB rows - confirm against callers.
        rows = []
        for ft in fileTuples:
            _lfn, originalID, fileID, status, taskID, targetSE, _usedSE, _errorCount, _lastUpdate, \
                _insertTime, runNumber, size, fileType, rawAncestors = ft[:14]
            if status in ('Removed', ):
                continue
            if not re.search('-', status):
                status = "%s-inherited" % status
                if taskID:
                    taskID = 1000000 * int(originalID) + int(taskID)
            rows.append("(%d,'%s',%s,%d,'%s',UTC_TIMESTAMP(),%s,%s,'%s',%s)" %
                        (transID, status, taskID, fileID, targetSE, runNumber, size, fileType, rawAncestors))
        if not rows:
            continue
        # one join instead of re-formatting the growing request for every file
        req = "%s %s" % (insertHeader, ", ".join(rows))
        res = self._update(req, connection)
        if not res['OK']:
            return res

    # We must also copy the run table entries if any
    result = self.getTransformationRuns({'TransformationID': originalID})
    if not result['OK']:
        return result
    # iterate over the runs just fetched (the original looped over the
    # result of the last INSERT instead)
    for runDict in result['Value']:
        runID = runDict['RunNumber']
        selectedSite = runDict['SelectedSite']
        status = runDict['Status']
        res = self.insertTransformationRun(transID, runID, selectedSite=selectedSite,
                                           status=status, connection=connection)
        if not res['OK']:
            return res

    return S_OK()
def removeRemoteFiles(dm, lfns):
    """Remove the given LFNs through ``dm.removeFile``, 100 at a time.

    :param dm: object whose ``removeFile(list of LFNs)`` returns a dict with
               'OK' and, on failure, 'Message'
    :param lfns: LFNs to remove
    :returns: S_OK() once every chunk was processed (also for no LFNs at all),
              S_ERROR for the first chunk whose removal call failed
    """
    for lfnList in breakListIntoChunks(lfns, 100):
        res = dm.removeFile(lfnList)
        if not res['OK']:
            # format the list explicitly: str + list raises TypeError
            return S_ERROR("Failed to remove files: %s %s" % (lfnList, res['Message']))
    # only report success after all chunks, not after the first one
    return S_OK()
def removeRemoteFiles(dm, lfns):
    """Remove the given LFNs through ``dm.removeFile``, 100 at a time.

    :param dm: object whose ``removeFile(list of LFNs)`` returns a dict with
               "OK" and, on failure, "Message"
    :param lfns: LFNs to remove
    :returns: S_OK() once every chunk was processed (also for no LFNs at all),
              S_ERROR for the first chunk whose removal call failed
    """
    for lfnList in breakListIntoChunks(lfns, 100):
        res = dm.removeFile(lfnList)
        if not res["OK"]:
            # format the list explicitly: str + list raises TypeError
            return S_ERROR("Failed to remove files: %s %s" % (lfnList, res["Message"]))
    # only report success after all chunks, not after the first one
    return S_OK()
def getJobStatus(self, jobIDList):
    """Get the status information for the given list of jobs.

    ``condor_q`` and ``condor_history`` are called once per chunk of 100
    condor IDs.  Pilots found in status HELD are removed with ``condor_rm``
    and reported as aborted.

    :param jobIDList: a single job reference or a list of job references
    :returns: S_OK(dict job -> pilot status), or S_ERROR with the output of a
              failing ``condor_q`` call
    """
    # If we use a local schedd, then we have to cleanup executables regularly
    if self.useLocalSchedd:
        self.__cleanup()

    self.log.verbose("Job ID List for status: %s " % jobIDList)
    if isinstance(jobIDList, six.string_types):
        jobIDList = [jobIDList]

    resultDict = {}
    condorIDs = {}
    # Get all condorIDs so we can just call condor_q and condor_history once
    for jobRef in jobIDList:
        job, _, jobID = condorIDAndPathToResultFromJobRef(jobRef)
        condorIDs[job] = jobID

    qList = []
    # A real list is handed to the chunker: on Python 3 dict.values() is a
    # view that cannot be sliced.
    for _condorIDs in breakListIntoChunks(list(condorIDs.values()), 100):
        joinedIDs = " ".join(_condorIDs)

        # This will return a list of 1245.75 3
        status, stdout_q = commands.getstatusoutput(
            "condor_q %s %s -af:j JobStatus " % (self.remoteScheddOptions, joinedIDs))
        if status != 0:
            return S_ERROR(stdout_q)
        _qList = stdout_q.strip().split("\n")
        qList.extend(_qList)

        # FIXME: condor_history does only support j for autoformat from 8.5.3,
        # format adds whitespace for each field This will return a list of 1245 75 3
        # needs to concatenate the first two with a dot
        condorHistCall = "condor_history %s %s -af ClusterId ProcId JobStatus" % (
            self.remoteScheddOptions,
            joinedIDs,
        )
        treatCondorHistory(condorHistCall, qList)

    for job, jobID in condorIDs.items():
        pilotStatus = parseCondorStatus(qList, jobID)
        if pilotStatus == "HELD":
            # make sure the pilot stays dead and gets taken out of the condor_q
            _rmStat, _rmOut = commands.getstatusoutput(
                "condor_rm %s %s " % (self.remoteScheddOptions, jobID))
            pilotStatus = PilotStatus.ABORTED

        resultDict[job] = pilotStatus

    self.log.verbose("Pilot Statuses: %s " % resultDict)
    return S_OK(resultDict)
def getLFNStatus(self, jobs):
    """Get all the LFNs for the jobs and get their status.

    First the job information of every job is retrieved, collecting the
    input and output files; then the existence of the collected LFNs is
    checked in chunks of 200.  Both steps are retried for as long as they
    raise RuntimeError (respectively return a failed result).

    :param dict jobs: job objects providing ``getJobInformation``, ``jobID``,
                      ``inputFiles`` and ``outputFiles``
    :returns: dict of the 'Successful' entries returned by ``self.fcClient.exists``
    """
    self.log.notice('Collecting LFNs...')
    lfnExistence = {}
    lfnCache = []
    jobInfoStart = time.time()
    for counter, job in enumerate(jobs.values()):
        if counter % self.printEveryNJobs == 0:
            self.log.notice('Getting JobInfo: %d/%d: %3.1fs' %
                            (counter, len(jobs), float(time.time() - jobInfoStart)))
        while True:
            try:
                job.getJobInformation(self.diracAPI, self.jobMon, jdlOnly=self.getJobInfoFromJDLOnly)
                lfnCache.extend(job.inputFiles)
                lfnCache.extend(job.outputFiles)
                break
            except RuntimeError as e:  # try again
                self.log.error('+++++ Failure for job:', job.jobID)
                self.log.error('+++++ Exception: ', str(e))

    timeSpent = float(time.time() - jobInfoStart)
    # Average over the number of jobs; the last loop index is one too small
    # and is zero for a single job (and when there are no jobs at all).
    nJobs = len(jobs)
    timePerJob = timeSpent / nJobs if nJobs else 0.0
    self.log.notice('Getting JobInfo Done: %3.1fs (%3.3fs per job)' % (timeSpent, timePerJob))

    counter = 0
    fileInfoStart = time.time()
    for lfnChunk in breakListIntoChunks(list(lfnCache), 200):
        counter += 200
        if counter % 1000 == 0:
            self.log.notice('Getting FileInfo: %d/%d: %3.1fs' %
                            (counter, len(lfnCache), float(time.time() - fileInfoStart)))
        while True:
            try:
                reps = self.fcClient.exists(lfnChunk)
                if not reps['OK']:
                    self.log.error('Failed to check file existence, try again...', reps['Message'])
                    raise RuntimeError('Try again')
                statuses = reps['Value']
                lfnExistence.update(statuses['Successful'])
                break
            except RuntimeError:  # try again
                pass
    self.log.notice('Getting FileInfo Done: %3.1fs' % (float(time.time() - fileInfoStart)))

    return lfnExistence
def __removeWMSTasks( self, jobIDs ):
    """ Kill and delete the given jobs in the WMS, then delete the failover
        requests associated to them.

    :param jobIDs: list of job IDs
    :return: S_OK() when everything was removed; S_ERROR when some jobs could not
             be killed/deleted or some requests could not be deleted; the failed
             reply of getRequestForJobs is returned unchanged
    """

    def _reportOutcome( res, nJobs, doneVerb, verb ):
        """ Log the outcome of one bulk WMS call; return False on a real failure """
        if res['OK']:
            gLogger.info( "Successfully %s %d jobs from WMS" % ( doneVerb, nJobs ) )
        elif 'InvalidJobIDs' in res and 'NonauthorizedJobIDs' not in res and 'FailedJobIDs' not in res:
            # Jobs unknown to the WMS are not counted as a failure
            gLogger.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
        elif 'NonauthorizedJobIDs' in res:
            gLogger.error( "Failed to %s %s jobs because not authorized" % ( verb, len( res['NonauthorizedJobIDs'] ) ) )
            return False
        elif 'FailedJobIDs' in res:
            gLogger.error( "Failed to %s %s jobs" % ( verb, len( res['FailedJobIDs'] ) ) )
            return False
        return True

    allRemove = True
    for jobList in breakListIntoChunks( jobIDs, 500 ):
        if not _reportOutcome( self.wmsClient.killJob( jobList ), len( jobList ), 'killed', 'kill' ):
            allRemove = False
        if not _reportOutcome( self.wmsClient.deleteJob( jobList ), len( jobList ), 'removed', 'remove' ):
            allRemove = False

    if not allRemove:
        return S_ERROR( "Failed to remove all remnants from WMS" )
    gLogger.info( "Successfully removed all tasks from the WMS" )

    res = self.requestClient.getRequestForJobs( jobIDs )
    if not res['OK']:
        gLogger.error( "Failed to get requestID for jobs.", res['Message'] )
        return res
    failoverRequests = res['Value']
    gLogger.info( "Found %d jobs with associated failover requests" % len( failoverRequests ) )
    if not failoverRequests:
        return S_OK()

    failed = 0
    for jobID, requestName in failoverRequests.items():
        res = self.requestClient.deleteRequest( requestName )
        if not res['OK']:
            gLogger.error( "Failed to remove request from RequestDB", res['Message'] )
            failed += 1
        else:
            gLogger.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) )
    if failed:
        gLogger.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) )
        gLogger.info( "Failed to remove %s requests" % failed )
        return S_ERROR( "Failed to remove all the request from RequestDB" )
    gLogger.info( "Successfully removed all the associated failover requests" )
    return S_OK()
def removeFile(self, path):
    """Remove physically the file specified by its path.

    Files are handled in chunks of 100: each chunk is first removed from the
    stager (stager_rm) and then from the nameserver (nsrm).

    :param path: path(s) in any form accepted by __checkArgumentFormat
    :return: S_OK({'Failed': {url: error}, 'Successful': {url: True}}), or the
             failed result of the argument check
    """
    res = self.__checkArgumentFormat(path)
    if not res['OK']:
        return res

    successful = {}
    failed = {}
    stagerError = "RFIOStorage.removeFile. Completely failed to remove files from the stager."
    nameserverError = "RFIOStorage.removeFile. Completely failed to remove files from the nameserver."

    def _failChunk(errStr, detail, pfns):
        """Log the error and flag every file of the chunk as failed."""
        gLogger.error(errStr, detail)
        for pfn in pfns:
            failed[pfn] = errStr

    for urlChunk in breakListIntoChunks(res['Value'], 100):
        gLogger.debug(
            "RFIOStorage.removeFile: Attempting to remove %s files." % len(urlChunk))

        # Step 1: remove from the stager
        comm = 'stager_rm -S %s' % self.spaceToken
        comm += ''.join(" -M %s" % url for url in urlChunk)
        res = shellCall(100, comm)
        if not res['OK']:
            _failChunk(stagerError, res['Message'], urlChunk)
            continue
        returncode, _stdout, stderr = res['Value']
        if returncode not in [0, 1]:
            _failChunk(stagerError, stderr, urlChunk)
            continue

        # Step 2: remove from the nameserver
        comm = 'nsrm -f' + ''.join(" %s" % url for url in urlChunk)
        res = shellCall(100, comm)
        if not res['OK']:
            _failChunk(nameserverError, res['Message'], urlChunk)
            continue
        returncode, _stdout, stderr = res['Value']
        if returncode not in [0, 1]:
            _failChunk(nameserverError, stderr, urlChunk)
            continue

        for pfn in urlChunk:
            successful[pfn] = True

    resDict = {'Failed': failed, 'Successful': successful}
    return S_OK(resDict)
def __verifyPfns( self, pfnSizes, storageElements ):
    """ Check, in chunks of 100, that the given storage PFNs are known to the catalog.

    :param pfnSizes: dict keyed by PFN
    :param storageElements: list of storage element names; the first one is used
                            to obtain the PFNs as they should be registered
    :return: S_OK( {'Remove': [...], 'ReRegister': [...], 'AllDone': bool} ).
             Both lists stay empty in the current code, which only prints the
             missing SURLs instead of collecting them.
    """
    gLogger.info( 'Checking %s storage files exist in the catalog' % len( pfnSizes ) )
    pfnsToRemove = []
    incorrectlyRegistered = []
    allDone = True
    for pfns in breakListIntoChunks( sortList( pfnSizes.keys() ), 100 ):
        # First get the PFNs as they should be registered in the catalog
        res = self.replicaManager.getPfnForProtocol( pfns, storageElements[0], withPort = False )
        if not res['OK']:
            allDone = False
            continue
        failedPfns = res['Value']['Failed']
        for pfn, error in failedPfns.items():
            gLogger.error( 'Failed to obtain registered PFN for physical file', '%s %s' % ( pfn, error ) )
        if failedPfns:
            allDone = False
        catalogStoragePfns = res['Value']['Successful']

        # Determine whether these PFNs are registered and if so obtain the LFN
        res = self.replicaManager.getCatalogLFNForPFN( catalogStoragePfns.values() )
        if not res['OK']:
            allDone = False
            continue
        failedSurls = res['Value']['Failed']
        for surl in sortList( failedSurls.keys() ):
            reason = failedSurls[surl]
            if reason == 'No such file or directory':
                # Collecting into pfnsToRemove is disabled; the SURL is only printed
                print( surl )
            else:
                gLogger.error( 'Failed to get LFN for PFN', '%s %s' % ( surl, reason ) )

        existingLFNs = res['Value']['Successful'].values()
        if not existingLFNs:
            continue
        res = self.replicaManager.getCatalogReplicas( existingLFNs )
        if not res['OK']:
            allDone = False
            continue
        failedLfns = res['Value']['Failed']
        for lfn, error in failedLfns.items():
            gLogger.error( 'Failed to obtain registered replicas for LFN', '%s %s' % ( lfn, error ) )
        if failedLfns:
            allDone = False
        for lfn, replicas in res['Value']['Successful'].items():
            match = False
            for storageElement in storageElements:
                if storageElement in replicas.keys():
                    match = True
            if not match:
                # Collecting into incorrectlyRegistered is disabled
                pass

    gLogger.info( "Verification of PFNs complete" )
    if incorrectlyRegistered:
        gLogger.info( "Found %d files incorrectly registered" % len( incorrectlyRegistered ) )
    if pfnsToRemove:
        gLogger.info( "Found %d files to be removed" % len( pfnsToRemove ) )
    resDict = {'Remove':pfnsToRemove, 'ReRegister':incorrectlyRegistered, 'AllDone':allDone}
    return S_OK( resDict )
def removeFile( self, lfns ):
    """ Remove the given LFNs through the RPC service, 100 at a time.

    :param lfns: list of LFNs
    :return: S_OK( {'Successful': {...}, 'Failed': {...}} ) merged over all chunks;
             the first failed RPC reply is returned unchanged
    """
    rpcClient = self._getRPC()
    resDict = {'Successful': {}, 'Failed': {}}
    for fList in breakListIntoChunks( lfns, 100 ):
        res = rpcClient.removeFile( fList )
        if not res['OK']:
            return res
        for outcome in ( 'Successful', 'Failed' ):
            resDict[outcome].update( res['Value'][outcome] )
    return S_OK( resDict )
def removeFile(self, lfns):
    """Remove the given LFNs through the RPC service, 100 at a time.

    :param lfns: list of LFNs
    :return: S_OK({'Successful': {...}, 'Failed': {...}}) merged over all chunks;
             the first failed RPC reply is returned unchanged
    """
    rpcClient = self._getRPC()
    merged = {'Successful': {}, 'Failed': {}}
    for chunk in breakListIntoChunks(lfns, 100):
        reply = rpcClient.removeFile(chunk)
        if not reply['OK']:
            return reply
        value = reply['Value']
        merged['Successful'].update(value['Successful'])
        merged['Failed'].update(value['Failed'])
    return S_OK(merged)
def removeFile(self, lfns):
    """Remove the given LFNs through the RPC service, 100 at a time.

    :param lfns: list of LFNs
    :return: S_OK({"Successful": {...}, "Failed": {...}}) merged over all chunks;
             the first failed RPC reply is returned unchanged
    """
    rpcClient = self._getRPC()
    removed, notRemoved = {}, {}
    for lfnChunk in breakListIntoChunks(lfns, 100):
        res = rpcClient.removeFile(lfnChunk)
        if not res["OK"]:
            return res
        outcome = res["Value"]
        removed.update(outcome["Successful"])
        notRemoved.update(outcome["Failed"])
    return S_OK({"Successful": removed, "Failed": notRemoved})
def __removeWMSTasks(self, jobIDs):
    """Delete the given jobs from the WMS, then delete the failover requests
    associated to them.

    :param jobIDs: list of job IDs
    :return: S_OK() when everything was removed; S_ERROR when some jobs or
             requests could not be removed; the failed reply of
             getRequestForJobs is returned unchanged
    """
    allRemove = True
    for jobList in breakListIntoChunks(jobIDs, 500):
        res = self.wmsClient.deleteJob(jobList)
        if res['OK']:
            gLogger.info("Successfully removed %d jobs from WMS" % len(jobList))
        elif 'InvalidJobIDs' in res and 'NonauthorizedJobIDs' not in res and 'FailedJobIDs' not in res:
            # Jobs unknown to the WMS are not counted as a failure
            gLogger.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs']))
        elif 'NonauthorizedJobIDs' in res:
            gLogger.error("Failed to remove %s jobs because not authorized" %
                          len(res['NonauthorizedJobIDs']))
            allRemove = False
        elif 'FailedJobIDs' in res:
            gLogger.error("Failed to remove %s jobs" % len(res['FailedJobIDs']))
            allRemove = False

    if not allRemove:
        return S_ERROR("Failed to remove all remnants from WMS")
    gLogger.info("Successfully removed all tasks from the WMS")

    res = self.requestClient.getRequestForJobs(jobIDs)
    if not res['OK']:
        gLogger.error("Failed to get requestID for jobs.", res['Message'])
        return res
    failoverRequests = res['Value']
    gLogger.info("Found %d jobs with associated failover requests" % len(failoverRequests))
    if not failoverRequests:
        return S_OK()

    failed = 0
    for jobID, requestName in failoverRequests.items():
        res = self.requestClient.deleteRequest(requestName)
        if not res['OK']:
            gLogger.error("Failed to remove request from RequestDB", res['Message'])
            failed += 1
        else:
            gLogger.verbose("Removed request %s associated to job %d." % (requestName, jobID))
    if failed:
        gLogger.info("Successfully removed %s requests" % (len(failoverRequests) - failed))
        gLogger.info("Failed to remove %s requests" % failed)
        return S_ERROR("Failed to remove all the request from RequestDB")
    gLogger.info("Successfully removed all the associated failover requests")
    return S_OK()
def __exists(self, lfns):
    """Ask the service, in chunks of self.splitSize, whether the given LFNs exist.

    :param lfns: list of LFNs
    :return: S_OK({'Successful': {lfn: exists}, 'Failed': {lfn: error message}})
    """
    server = RPCClient(self.url, timeout=120)
    successful = {}
    failed = {}
    for lfnList in breakListIntoChunks(lfns, self.splitSize):
        res = server.exists(lfnList)
        if not res['OK']:
            # The whole chunk failed: report the same message for each of its LFNs
            for lfn in lfnList:
                failed[lfn] = res['Message']
        else:
            successful.update(res['Value'])
    # Return the collected failures; an empty dict used to be returned here,
    # which silently hid every failed chunk from the caller.
    resDict = {'Successful': successful, 'Failed': failed}
    return S_OK(resDict)
def __exists(self, lfns):
    """Ask the service, in chunks of self.splitSize, whether the given LFNs exist.

    :param lfns: list of LFNs
    :return: S_OK({"Successful": {lfn: exists}, "Failed": {lfn: error message}})
    """
    server = RPCClient(self.url, timeout=120)
    successful = {}
    failed = {}
    for lfnList in breakListIntoChunks(lfns, self.splitSize):
        res = server.exists(lfnList)
        if not res["OK"]:
            # The whole chunk failed: report the same message for each of its LFNs
            for lfn in lfnList:
                failed[lfn] = res["Message"]
        else:
            successful.update(res["Value"])
    # Return the collected failures; an empty dict used to be returned here,
    # which silently hid every failed chunk from the caller.
    resDict = {"Successful": successful, "Failed": failed}
    return S_OK(resDict)
def __exists( self, lfns ):
    """ Ask the service, in chunks of self.splitSize, whether the given LFNs exist.

    :param lfns: list of LFNs
    :return: S_OK( {'Successful': {lfn: exists}, 'Failed': {lfn: error message}} )
    """
    server = RPCClient( self.url, timeout = 120 )
    successful = {}
    failed = {}
    for lfnList in breakListIntoChunks( lfns, self.splitSize ):
        res = server.exists( lfnList )
        if not res['OK']:
            # The whole chunk failed: report the same message for each of its LFNs
            for lfn in lfnList:
                failed[lfn] = res['Message']
        else:
            successful.update( res['Value'] )
    # Return the collected failures; an empty dict used to be returned here,
    # which silently hid every failed chunk from the caller.
    resDict = {'Successful':successful, 'Failed':failed}
    return S_OK( resDict )
def removeFile( self, path ):
    """ Remove physically the file specified by its path.

    Files are handled in chunks of 100: each chunk is first removed from the
    stager (stager_rm) and then from the nameserver (nsrm).

    :param path: path(s) in any form accepted by __checkArgumentFormat
    :return: S_OK( {'Failed': {url: error}, 'Successful': {url: True}} ), or the
             failed result of the argument check
    """
    res = self.__checkArgumentFormat( path )
    if not res['OK']:
        return res

    successful = {}
    failed = {}
    stagerError = "RFIOStorage.removeFile. Completely failed to remove files from the stager."
    nameserverError = "RFIOStorage.removeFile. Completely failed to remove files from the nameserver."

    def _failChunk( errStr, detail, pfns ):
        """ Log the error and flag every file of the chunk as failed """
        gLogger.error( errStr, detail )
        for pfn in pfns:
            failed[pfn] = errStr

    for urlChunk in breakListIntoChunks( res['Value'], 100 ):
        gLogger.debug( "RFIOStorage.removeFile: Attempting to remove %s files." % len( urlChunk ) )

        # Step 1: remove from the stager
        stagerCmd = 'stager_rm -S %s' % self.spaceToken
        stagerCmd += ''.join( " -M %s" % url for url in urlChunk )
        res = shellCall( 100, stagerCmd )
        if not res['OK']:
            _failChunk( stagerError, res['Message'], urlChunk )
            continue
        returncode, _stdout, stderr = res['Value']
        if returncode not in [0, 1]:
            _failChunk( stagerError, stderr, urlChunk )
            continue

        # Step 2: remove from the nameserver
        nsCmd = 'nsrm -f' + ''.join( " %s" % url for url in urlChunk )
        res = shellCall( 100, nsCmd )
        if not res['OK']:
            _failChunk( nameserverError, res['Message'], urlChunk )
            continue
        returncode, _stdout, stderr = res['Value']
        if returncode not in [0, 1]:
            _failChunk( nameserverError, stderr, urlChunk )
            continue

        for pfn in urlChunk:
            successful[pfn] = True

    resDict = {'Failed':failed, 'Successful':successful}
    return S_OK( resDict )
def addFile(self, lfns):
    """Register supplied files.

    Files are registered in bulk, 100 at a time; when a bulk call raises, the
    files of that chunk are registered one by one.

    :param lfns: dict {lfn: info}; info provides "SE", "Size", "Checksum",
                 optionally "GUID", and "PFN" when the SE is not deterministic
    :return: S_OK({"Failed": {lfn: error}, "Successful": {lfn: True}})
    """
    failed = {}
    successful = {}
    # Cache of the "deterministic" flag per storage element
    deterministicDictionary = {}

    for lfnList in breakListIntoChunks(lfns, 100):
        chunkLFNs = list(lfnList)
        replicaDescriptions = []
        for lfn in chunkLFNs:
            lfnInfo = lfns[lfn]
            se = lfnInfo["SE"]
            if se not in deterministicDictionary:
                deterministicDictionary[se] = self.client.get_rse(se)["deterministic"]
            # Only non deterministic SEs need the explicit PFN
            pfn = None if deterministicDictionary[se] else lfnInfo["PFN"]
            rep = {
                "lfn": lfn,
                "bytes": lfnInfo["Size"],
                "adler32": lfnInfo["Checksum"],
                "rse": se,
            }
            if pfn:
                rep["pfn"] = pfn
            guid = lfnInfo.get("GUID", None)
            if guid:
                rep["guid"] = guid
            replicaDescriptions.append(rep)

        try:
            self.client.add_files(lfns=replicaDescriptions, ignore_availability=True)
            for lfn in chunkLFNs:
                successful[lfn] = True
        except Exception as err:
            # Try inserting one by one
            sLog.warn("Cannot bulk insert files", "error : %s" % repr(err))
            for rep in replicaDescriptions:
                name = rep["lfn"]
                try:
                    self.client.add_files(lfns=[rep], ignore_availability=True)
                    successful[name] = True
                except FileReplicaAlreadyExists:
                    # An already registered replica counts as a success
                    successful[name] = True
                except Exception as singleErr:
                    failed[name] = str(singleErr)

    resDict = {"Failed": failed, "Successful": successful}
    sLog.debug(resDict)
    return S_OK(resDict)
def getJobStatus( self, jobIDList ):
    """ Get the status information for the given list of jobs

    :param jobIDList: list of job references that pfnparse can split
    :return: S_OK( {job reference: 'Done'|'Running'|'Waiting'|'Unknown'} ), or
             S_ERROR when the SSH call fails, times out or exits non-zero
    """
    # qstat state letter -> reported status
    torqueToStatus = { 'E': 'Done', 'C': 'Done',
                       'R': 'Running',
                       'S': 'Waiting', 'W': 'Waiting', 'Q': 'Waiting', 'H': 'Waiting', 'T': 'Waiting' }
    resultDict = {}
    ssh = SSH( parameters = self.ceParameters )

    for jobList in breakListIntoChunks( jobIDList, 100 ):
        # Map the stamp (file name up to the first dot) to the job reference
        jobDict = {}
        for job in jobList:
            result = pfnparse( job )
            if not result['OK']:
                self.log.error( 'Invalid job id', job )
                continue
            stamp = result['Value']['FileName'].split( '.' )[0]
            jobDict[stamp] = job

        cmd = [ 'qstat', ' '.join( jobDict.keys() ) ]
        result = ssh.sshCall( 10, cmd )
        if not result['OK']:
            return result
        status = result['Value'][0]
        if status == -1:
            return S_ERROR( 'Timeout while SSH call' )
        if status != 0:
            return S_ERROR( 'Error while SSH call' )

        lines = result['Value'][1].replace( '\r', '' ).split( '\n' )
        for stamp, jobRef in jobDict.items():
            resultDict[jobRef] = 'Unknown'
            for line in lines:
                if stamp not in line:
                    continue
                if 'Unknown' in line:
                    resultDict[jobRef] = 'Unknown'
                    continue
                torqueStatus = line.split()[4]
                if torqueStatus in torqueToStatus:
                    resultDict[jobRef] = torqueToStatus[torqueStatus]

    return S_OK( resultDict )
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs

    :param jobIDList: list of job references that pfnparse can split
    :return: S_OK({job reference: 'Done'|'Running'|'Waiting'|'Unknown'}), or
             S_ERROR when the SSH call fails, times out or exits non-zero
    """
    # qstat state letter -> reported status
    stateMap = {}
    stateMap.update(dict.fromkeys(['E', 'C'], 'Done'))
    stateMap.update(dict.fromkeys(['R'], 'Running'))
    stateMap.update(dict.fromkeys(['S', 'W', 'Q', 'H', 'T'], 'Waiting'))

    resultDict = {}
    ssh = SSH(parameters=self.ceParameters)

    for jobList in breakListIntoChunks(jobIDList, 100):
        # Map the stamp (file name up to the first dot) to the job reference
        jobDict = {}
        for job in jobList:
            parsed = pfnparse(job)
            if not parsed['OK']:
                self.log.error('Invalid job id', job)
                continue
            jobDict[parsed['Value']['FileName'].split('.')[0]] = job

        stampList = jobDict.keys()
        result = ssh.sshCall(10, ['qstat', ' '.join(stampList)])
        if not result['OK']:
            return result
        status = result['Value'][0]
        if status == -1:
            return S_ERROR('Timeout while SSH call')
        if status != 0:
            return S_ERROR('Error while SSH call')

        output = result['Value'][1].replace('\r', '')
        lines = output.split('\n')
        for stamp in jobDict:
            jobRef = jobDict[stamp]
            resultDict[jobRef] = 'Unknown'
            for line in lines:
                if stamp not in line:
                    continue
                if 'Unknown' in line:
                    resultDict[jobRef] = 'Unknown'
                    continue
                torqueStatus = line.split()[4]
                if torqueStatus in stateMap:
                    resultDict[jobRef] = stateMap[torqueStatus]

    return S_OK(resultDict)
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs

    The status is taken from the qstat output. A job that is still 'Unknown'
    after scanning qstat is reported as 'Done' when files matching its number
    exist in self.batchOutput.

    :param jobIDList: list of job ID strings; the part before the first dot is
                      the job number looked for in the qstat output
    :return: S_OK({job ID: 'Done'|'Running'|'Waiting'|'Unknown'}), or the failed
             result of the qstat SSH call
    """
    # qstat state -> reported status
    stateMap = {'Tt': 'Done', 'Tr': 'Done',
                'Rr': 'Running', 'r': 'Running',
                'qw': 'Waiting', 'h': 'Waiting'}
    resultDict = {}
    ssh = SSH(self.sshUser, self.sshHost, self.sshPassword)

    for jobList in breakListIntoChunks(jobIDList, 100):
        jobDict = {}
        for job in jobList:
            jobNumber = job.split('.')[0]
            if jobNumber:
                jobDict[jobNumber] = job

        cmd = ("source %s; qstat") % (self.geEnv)
        result = ssh.sshCall(10, cmd)
        if not result['OK']:
            return result
        lines = result['Value'][1].replace('\r', '').split('\n')

        for jobNumber, jobRef in jobDict.items():
            resultDict[jobRef] = 'Unknown'
            for line in lines:
                if line.find(jobNumber) == -1:
                    continue
                if line.find('Unknown') != -1:
                    resultDict[jobRef] = 'Unknown'
                else:
                    torqueStatus = line.split()[4]
                    if torqueStatus in stateMap:
                        resultDict[jobRef] = stateMap[torqueStatus]

            if resultDict[jobRef] != 'Unknown':
                continue

            # Not known to qstat: probe once per job whether its output files exist
            cmd = ("ls -la %s/*%s*") % (self.batchOutput, jobNumber)
            result = ssh.sshCall(10, cmd)
            if not result['OK']:
                # Cannot tell anything about the output, keep the job 'Unknown'
                self.log.error("Failed to check the job output", result['Message'])
                continue
            # result['Value'] is a tuple, so search the text of its elements:
            # a plain `in` on the tuple tests element equality and never matches.
            lsOutput = ' '.join(str(part) for part in result['Value'])
            subS = ("No such file or directory")
            if subS in lsOutput:
                self.log.debug("Output no ready")
            else:
                resultDict[jobRef] = 'Done'

    self.log.debug("Result dict: ")
    self.log.debug(resultDict)
    return S_OK(resultDict)
def registerCopiedFiles(self, filesNewlyCopied, copiedFiles, allUnmigratedFilesMeta):
    """ Register successfully copied files (newly, or in Copied status in the DB) in the DFC.

    :param filesNewlyCopied: [lfns] of files newly copied
    :param copiedFiles: {lfn:RIDb metadata} of files that were in Copied state.
                        This dict is updated in place with the newly copied files.
    :param allUnmigratedFilesMeta: {lfn:RI Db metadata} for all lfns non migrated at
                                   the beginning of the loop.

    :return: {lfn:True} for successfully registered lfns
    """
    if not (filesNewlyCopied or copiedFiles):
        self.log.info("No files to be registered")
    else:
        self.log.info("Attempting to register %s newly copied and %s previously copied files" %
                      (len(filesNewlyCopied), len(copiedFiles)))

    # Make copiedFiles also contain the newly copied files
    newlyCopiedMeta = {lfn: allUnmigratedFilesMeta[lfn] for lfn in filesNewlyCopied}
    copiedFiles.update(newlyCopiedMeta)

    successfulRegister = {}
    failedRegister = {}

    # Register in batches of 100
    for lfnChunk in breakListIntoChunks(copiedFiles, 100):
        # Attach the metadata to each lfn of the batch
        lfnDictChunk = {lfn: copiedFiles[lfn] for lfn in lfnChunk}
        res = self.fileCatalog.addFile(lfnDictChunk)
        if res['OK']:
            successfulRegister.update(res['Value']['Successful'])
            failedRegister.update(res['Value']['Failed'])
        else:
            self.log.error("Completely failed to register some successfully copied file.",
                           res['Message'])
            for lfn in lfnDictChunk:
                failedRegister[lfn] = res['Message']

    gMonitor.addMark("ErrorRegister", len(failedRegister))
    for lfn, reason in failedRegister.iteritems():
        self.log.error("Failed to register lfn. Setting to Copied", "%s: %s" % (lfn, reason))
        res = self.rawIntegrityDB.setFileStatus(lfn, 'Copied')
        if not res['OK']:
            self.log.error("Error setting file status to Copied", "%s: %s" % (lfn, res['Message']))

    for lfn in successfulRegister:
        self.log.info("Successfully registered %s in the File Catalog." % lfn)

    return successfulRegister
def getJobStatus( self, jobIDList ):
    """ Get the status information for the given list of jobs

    The status is taken from the qstat output. A job that is still 'Unknown'
    after scanning qstat is reported as 'Done' when files matching its number
    exist in self.batchOutput.

    :param jobIDList: list of job ID strings; the part before the first dot is
                      the job number looked for in the qstat output
    :return: S_OK( {job ID: 'Done'|'Running'|'Waiting'|'Unknown'} ), or the failed
             result of the qstat SSH call
    """
    # qstat state -> reported status
    stateMap = { 'Tt': 'Done', 'Tr': 'Done',
                 'Rr': 'Running', 'r': 'Running',
                 'qw': 'Waiting', 'h': 'Waiting' }
    resultDict = {}
    ssh = SSH( self.sshUser, self.sshHost, self.sshPassword )

    for jobList in breakListIntoChunks( jobIDList, 100 ):
        jobDict = {}
        for job in jobList:
            jobNumber = job.split( '.' )[0]
            if jobNumber:
                jobDict[jobNumber] = job

        cmd = ("source %s; qstat")%(self.geEnv)
        result = ssh.sshCall( 10, cmd )
        if not result['OK']:
            return result
        lines = result['Value'][1].replace( '\r', '' ).split( '\n' )

        for jobNumber, jobRef in jobDict.items():
            resultDict[jobRef] = 'Unknown'
            for line in lines:
                if line.find( jobNumber ) == -1:
                    continue
                if line.find( 'Unknown' ) != -1:
                    resultDict[jobRef] = 'Unknown'
                else:
                    torqueStatus = line.split()[4]
                    if torqueStatus in stateMap:
                        resultDict[jobRef] = stateMap[torqueStatus]

            if resultDict[jobRef] != 'Unknown':
                continue

            # Not known to qstat: probe once per job whether its output files exist
            cmd = ("ls -la %s/*%s*")%(self.batchOutput,jobNumber)
            result = ssh.sshCall( 10, cmd )
            if not result['OK']:
                # Cannot tell anything about the output, keep the job 'Unknown'
                self.log.error( "Failed to check the job output", result['Message'] )
                continue
            # result['Value'] is a tuple, so search the text of its elements:
            # a plain `in` on the tuple tests element equality and never matches.
            lsOutput = ' '.join( str( part ) for part in result['Value'] )
            subS = ("No such file or directory")
            if subS in lsOutput:
                self.log.debug ("Output no ready")
            else:
                resultDict[jobRef] = 'Done'

    self.log.debug("Result dict: ")
    self.log.debug(resultDict)
    return S_OK( resultDict )
def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):
    """Build the new transfer jobs for the files waiting to be submitted.

    Files are grouped by target SE, a source is selected for each of them, and
    one job is created per (target, source) pair and per chunk of files.

    :param maxFilesPerJob: maximum number of files put in one job
    :param maxAttemptsPerFile: passed on to _getFilesToSubmit
    :return: S_OK(list of new jobs), or the failed result of the grouping or of
             the source selection
    """
    log = self._log.getSubLogger("_prepareNewJobs", child=True)

    filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile)
    log.debug("%s ftsFiles to submit" % len(filesToSubmit))

    # {targetSE : [FTS3Files] }
    res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
    if not res['OK']:
        return res
    filesGroupedByTarget = res['Value']

    newJobs = []
    for targetSE, ftsFiles in filesGroupedByTarget.iteritems():

        res = self._checkSEAccess(targetSE, 'WriteAccess', vo=self.vo)
        if not res['OK']:
            # A currently banned SE is just skipped; anything else is an error
            if cmpError(res, errno.EACCES):
                log.info("Write access currently not permitted to %s, skipping." % targetSE)
            else:
                log.error(res)
            # Either way the files of this target have consumed one attempt
            for ftsFile in ftsFiles:
                ftsFile.attempt += 1
            continue

        if self.sourceSEs is not None:
            sourceSEs = self.sourceSEs.split(',')
        else:
            sourceSEs = []

        # { sourceSE : [FTSFiles] }
        res = FTS3Utilities.selectUniqueRandomSource(ftsFiles, allowedSources=sourceSEs)
        if not res['OK']:
            return res
        uniqueTransfersBySource = res['Value']

        # The source needs no check, it was already filtered by the DataManager
        for sourceSE, sourceFiles in uniqueTransfersBySource.iteritems():
            for ftsFilesChunk in breakListIntoChunks(sourceFiles, maxFilesPerJob):
                newJobs.append(
                    self._createNewJob('Transfer', ftsFilesChunk, targetSE, sourceSE=sourceSE))

    return S_OK(newJobs)
def cleanOutputs(self, jobInfo):
    """Remove all job outputs for job represented by jobInfo object.

    The descendants returned by __findAllDescendants are removed as well. When
    self.enabled is false, the files are only listed in the log.

    :param jobInfo: object providing outputFiles and outputFileStatus
    :raises RuntimeError: when no proxy can be obtained or a removal call fails
    """
    if len(jobInfo.outputFiles) == 0:
        return

    descendants = self.__findAllDescendants(jobInfo.outputFiles)
    # Keep only the output files whose status says they exist
    existingOutputFiles = []
    for lfn, status in izip_longest(jobInfo.outputFiles, jobInfo.outputFileStatus):
        if status == "Exists":
            existingOutputFiles.append(lfn)

    filesToDelete = existingOutputFiles + descendants
    if not filesToDelete:
        return

    fileListing = "\n +++ ".join(filesToDelete)
    if not self.enabled:
        self.log.notice("Would have removed these files: \n +++ %s " % fileListing)
        return
    self.log.notice("Remove these files: \n +++ %s " % fileListing)

    errorReasons = defaultdict(list)
    successfullyRemoved = 0

    for lfnList in breakListIntoChunks(filesToDelete, 200):
        # The removal is done under the proxy of the author
        with UserProxy(proxyUserDN=self.authorDN, proxyUserGroup=self.authorGroup) as proxyResult:
            if not proxyResult['OK']:
                raise RuntimeError('Failed to get a proxy: %s' % proxyResult['Message'])
            result = DataManager().removeFile(lfnList)
            if not result['OK']:
                self.log.error("Failed to remove LFNs", result['Message'])
                raise RuntimeError("Failed to remove LFNs: %s" % result['Message'])
            # Group the failed files by error message
            for lfn, err in result['Value']['Failed'].items():
                errorReasons[str(err)].append(lfn)
            successfullyRemoved += len(result['Value']['Successful'])

    for reason, lfns in errorReasons.items():
        self.log.error("Failed to remove %d files with error: %s" % (len(lfns), reason))
    self.log.notice("Successfully removed %d files" % successfullyRemoved)
def cleanOutputs(self, jobInfo):
    """Remove all job outputs, including the descendants of the output files.

    When self.enabled is false, the files are only listed in the log.

    :param jobInfo: object providing outputFiles and outputFileStatus
    :raises RuntimeError: when a removal call fails
    """
    if len(jobInfo.outputFiles) == 0:
        return
    descendants = self.__findAllDescendants(jobInfo.outputFiles)
    existingOutputFiles = [
        lfn for lfn, status in izip_longest(jobInfo.outputFiles, jobInfo.outputFileStatus)
        if status == "Exists"
    ]
    filesToDelete = existingOutputFiles + descendants

    if not filesToDelete:
        return

    if not self.enabled:
        self.log.notice("Would have removed these files: \n +++ %s " % "\n +++ ".join(filesToDelete))
        return
    self.log.notice("Remove these files: \n +++ %s " % "\n +++ ".join(filesToDelete))

    errorReasons = {}
    successfullyRemoved = 0

    for lfnList in breakListIntoChunks(filesToDelete, 200):
        # The files have to be removed with the Shifter credentials and not with
        # the server credentials
        gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false')
        try:
            result = DataManager().removeFile(lfnList)
        finally:
            # Always switch back, even if removeFile raises; otherwise the
            # process would go on without the server certificate
            gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')
        if not result['OK']:
            self.log.error("Failed to remove LFNs", result['Message'])
            raise RuntimeError("Failed to remove LFNs: %s" % result['Message'])
        # Group the failed files by error message
        for lfn, err in result['Value']['Failed'].items():
            errorReasons.setdefault(str(err), []).append(lfn)
        successfullyRemoved += len(result['Value']['Successful'])

    for reason, lfns in errorReasons.items():
        self.log.error("Failed to remove %d files with error: %s" % (len(lfns), reason))
    self.log.notice("Successfully removed %d files" % successfullyRemoved)
def removeFile( self, lfn, rpc = '', url = '', timeout = None ):
    """ Remove the given LFNs through the RPC service, 100 at a time.

    :param lfn: LFN(s) in any form accepted by __checkArgumentFormat
    :param rpc: passed on to _getRPC
    :param url: passed on to _getRPC
    :param timeout: passed on to _getRPC
    :return: S_OK( {'Successful': {...}, 'Failed': {...}} ) merged over all chunks;
             a failed argument check or RPC reply is returned unchanged
    """
    res = self.__checkArgumentFormat( lfn )
    if not res['OK']:
        return res
    lfns = res['Value'].keys()

    rpcClient = self._getRPC( rpc = rpc, url = url, timeout = timeout )
    resDict = {'Successful': {}, 'Failed': {}}
    for fList in breakListIntoChunks( lfns, 100 ):
        res = rpcClient.removeFile( fList )
        if not res['OK']:
            return res
        for outcome in ( 'Successful', 'Failed' ):
            resDict[outcome].update( res['Value'][outcome] )
    return S_OK( resDict )
def _groupByReplicas( self ):
    """ Generates a job based on the location of the input data

    :return: S_OK( [ ( replicaSE, [lfns] ), ... ] ), or S_ERROR when no
             parameters are set. Incomplete groups are only emitted when the
             'Status' parameter is 'Flush'.
    """
    if not self.params:
        return S_ERROR( "TransformationPlugin._Standard: The 'Standard' plug-in requires parameters." )
    groupSize = self.params['GroupSize']
    flush = self.params['Status'] == 'Flush'

    # Group files by SE
    fileGroups = self._getFileGroups( self.data )

    # Create tasks based on the group size
    tasks = []
    for replicaSE in sortList( fileGroups.keys() ):
        for taskLfns in breakListIntoChunks( fileGroups[replicaSE], groupSize ):
            if flush or len( taskLfns ) >= int( groupSize ):
                tasks.append( ( replicaSE, taskLfns ) )
    return S_OK( tasks )
def __setHasReplicaFlag(self, lfns):
    """Call addFiles on the service for the given LFNs, in chunks of self.splitSize.

    :param lfns: list of LFNs
    :return: S_OK({"Successful": {lfn: True}, "Failed": {lfn: error}})
    """
    server = RPCClient(self.url, timeout=120)
    successful = {}
    failed = {}
    for lfnList in breakListIntoChunks(lfns, self.splitSize):
        res = server.addFiles(lfnList)
        if not res["OK"]:
            # The whole chunk failed: report the same message for each of its LFNs
            for lfn in lfnList:
                failed[lfn] = res["Message"]
            continue
        # The reply only lists the LFNs for which the operation failed
        problems = res["Value"]
        for lfn in lfnList:
            if lfn in problems:
                failed[lfn] = problems[lfn]
            else:
                successful[lfn] = True
    resDict = {"Successful": successful, "Failed": failed}
    return S_OK(resDict)
def __unsetHasReplicaFlag( self, lfns ):
    """ Call removeFiles on the service for the given LFNs, in chunks of self.splitSize.

    :param lfns: list of LFNs
    :return: S_OK( {'Successful': {lfn: True}, 'Failed': {lfn: error}} )
    """
    server = RPCClient( self.url, timeout = 120 )
    successful = {}
    failed = {}
    for lfnList in breakListIntoChunks( lfns, self.splitSize ):
        res = server.removeFiles( lfnList )
        if not res['OK']:
            # The whole chunk failed: report the same message for each of its LFNs
            for lfn in lfnList:
                failed[lfn] = res['Message']
            continue
        # The reply only lists the LFNs for which the operation failed
        problems = res['Value']
        for lfn in lfnList:
            if lfn in problems:
                failed[lfn] = problems[lfn]
            else:
                successful[lfn] = True
    resDict = {'Successful':successful, 'Failed':failed}
    return S_OK( resDict )
def _findFileIDs( self, lfns, connection=False ):
    """ Find the lfn to FileID correspondence

    :param lfns: list of LFNs
    :param connection: passed to _getConnection
    :return: S_OK( {"Successful": {lfn: FileID}, "Failed": {lfn: error}} ), or the
             failed result of the directory lookup or of the query
    """
    connection = self._getConnection(connection)
    dirDict = self._getFileDirectories(lfns)

    result = self.db.dtree.findDirs( dirDict.keys() )
    if not result['OK']:
        return result
    directoryIDs = result['Value']

    failed = {}
    successful = {}
    # DirID -> directory path, needed to rebuild the LFN from the query rows
    directoryPaths = {}
    for dirPath, fileNames in dirDict.items():
        if dirPath in directoryIDs:
            directoryPaths[directoryIDs[dirPath]] = dirPath
            continue
        # Unknown directory: none of its files can exist
        for fileName in fileNames:
            fname = ( '%s/%s' % (dirPath,fileName) ).replace('//','/')
            failed[fname] = 'No such file or directory'

    for dirChunk in breakListIntoChunks( directoryIDs.keys(), 1000 ):
        wheres = [ "( DirID=%d AND FileName IN (%s) )" % ( directoryIDs[dirPath],
                                                          stringListToString( dirDict[dirPath] ) )
                   for dirPath in dirChunk ]
        req = "SELECT FileName,DirID,FileID FROM FC_Files WHERE %s" % " OR ".join( wheres )
        result = self.db._query(req,connection)
        if not result['OK']:
            return result
        for fileName, dirID, fileID in result['Value']:
            fname = ( '%s/%s' % (directoryPaths[dirID],fileName) ).replace('//','/')
            successful[fname] = fileID

    # Whatever was not found by the queries does not exist
    for lfn in lfns:
        if lfn not in successful:
            failed[lfn] = "No such file or directory"

    return S_OK({"Successful":successful,"Failed":failed})
def __getFileMetadata(self, lfns):
    """Get the metadata of the given LFNs from the service, in chunks of self.splitSize.

    :param lfns: list of LFNs
    :return: S_OK({"Successful": {lfn: metadata}, "Failed": {lfn: error}})
    """
    server = RPCClient(self.url, timeout=120)
    successful = {}
    failed = {}
    for lfnList in breakListIntoChunks(lfns, self.splitSize):
        res = server.getFileMetadata(lfnList)
        if not res["OK"]:
            # The whole chunk failed: report the same message for each of its LFNs
            for lfn in lfnList:
                failed[lfn] = res["Message"]
            continue
        metadata = res["Value"]
        for lfn in lfnList:
            if lfn not in metadata:
                failed[lfn] = "File does not exist"
            elif isinstance(metadata[lfn], types.StringTypes):
                # A string in place of the metadata is an error message. This
                # used to be tested with `in types.StringTypes`, which compares
                # the value with the type objects and is never true.
                failed[lfn] = metadata[lfn]
            else:
                successful[lfn] = metadata[lfn]
    resDict = {"Successful": successful, "Failed": failed}
    return S_OK(resDict)
def removeReplica( self, lfn, rpc = '', url = '', timeout = 120 ):
    """ Remove the given replicas through the RPC service, 100 at a time.

    :param lfn: argument accepted by __checkArgumentFormat; its checked form must
                map each LFN to a dict holding 'PFN' and 'SE'
    :param rpc: passed on to _getRPC
    :param url: passed on to _getRPC
    :param timeout: passed on to _getRPC
    :return: S_OK( {'Successful': {...}, 'Failed': {...}} ) merged over all chunks;
             a failed argument check or RPC reply is returned unchanged
    """
    res = self.__checkArgumentFormat( lfn )
    if not res['OK']:
        return res
    # The service expects ( lfn, pfn, se ) tuples
    tuples = [ ( lfnName, info['PFN'], info['SE'] ) for lfnName, info in res['Value'].items() ]

    rpcClient = self._getRPC( rpc = rpc, url = url, timeout = timeout )
    resDict = {'Successful': {}, 'Failed': {}}
    for fList in breakListIntoChunks( tuples, 100 ):
        res = rpcClient.removeReplica( fList )
        if not res['OK']:
            return res
        for outcome in ( 'Successful', 'Failed' ):
            resDict[outcome].update( res['Value'][outcome] )
    return S_OK( resDict )
def __getFileMetadata( self, lfns ):
    """ Get the metadata of the given LFNs from the service, in chunks of self.splitSize.

    :param lfns: list of LFNs
    :return: S_OK( {'Successful': {lfn: metadata}, 'Failed': {lfn: error}} )
    """
    server = RPCClient( self.url, timeout = 120 )
    successful = {}
    failed = {}
    for lfnList in breakListIntoChunks( lfns, self.splitSize ):
        res = server.getFileMetadata( lfnList )
        if not res['OK']:
            # The whole chunk failed: report the same message for each of its LFNs
            for lfn in lfnList:
                failed[lfn] = res['Message']
            continue
        metadata = res['Value']
        for lfn in lfnList:
            if lfn not in metadata:
                failed[lfn] = 'File does not exist'
            elif isinstance( metadata[lfn], types.StringTypes ):
                # A string in place of the metadata is an error message. This
                # used to be tested with `in types.StringTypes`, which compares
                # the value with the type objects and is never true.
                failed[lfn] = metadata[lfn]
            else:
                successful[lfn] = metadata[lfn]
    resDict = {'Successful':successful, 'Failed':failed}
    return S_OK( resDict )
def setPendingRequests(self, jobs):
    """Loop over all the jobs and get requests, if any.

    For every job with a request, job.pendingRequest is set to True unless the
    request status is 'Done' or 'Canceled'.

    :param jobs: dict whose values are job objects with a jobID attribute
    """
    for jobChunk in breakListIntoChunks(jobs.values(), 1000):
        jobIDs = [job.jobID for job in jobChunk]

        # Ask again until the request client answers successfully
        result = self.reqClient.readRequestsForJobs(jobIDs)
        while not result['OK']:
            self.log.error('Failed to read requests', result['Message'])
            result = self.reqClient.readRequestsForJobs(jobIDs)

        for jobID, request in result['Value']['Successful'].items():
            dbStatus = self.reqClient.getRequestStatus(request.RequestID).get('Value', 'Unknown')
            # Only the first job carrying this ID is updated
            job = next((candidate for candidate in jobChunk if candidate.jobID == jobID), None)
            if job is None:
                continue
            job.pendingRequest = dbStatus not in ('Done', 'Canceled')
            self.log.notice('Found %s request for job %d' %
                            ('pending' if job.pendingRequest else 'finished', jobID))
def _getFileLFNs(self, fileIDs):
    """ Get the file LFNs for a given list of file IDs

    This method is overridden because the base class hard codes the column names.

    :param fileIDs: list of file IDs
    :return: S_OK({'Successful': {fileID: lfn}, 'Failed': {fileID: error}}), or
             the failed result of the stored procedure call
    """
    successful = {}
    for idChunk in breakListIntoChunks(fileIDs, 1000):
        # The IDs are formatted to be used in an IN clause in the stored procedure
        result = self.db.executeStoredProcedureWithCursor(
            'ps_get_full_lfn_for_file_ids', (intListToString(idChunk), ))
        if not result['OK']:
            return result
        # Each row contains FileID, LFN
        successful.update((row[0], row[1]) for row in result['Value'])

    # IDs that no row was returned for
    failed = {}
    for fileID in set(fileIDs) - set(successful):
        failed[fileID] = "File ID not found"

    return S_OK({'Successful': successful, 'Failed': failed})
def getLFNStatus(self, jobs):
    """Get all the LFNs for the jobs and get their status.

    :param jobs: dict whose values are job objects; each must offer
                 getJobInformation(), inputFile, outputFiles and jobID
    :return: dict accumulated from the 'Successful' part of the file catalog
             exists() replies
    """
    self.log.notice('Collecting LFNs...')
    lfnExistence = {}
    lfnCache = []
    for counter, job in enumerate(jobs.values()):
        if counter % self.printEveryNJobs == 0:
            self.log.notice('Getting JobInfo: %d/%d: %3.1fs' %
                            (counter, len(jobs), float(time.time() - self.startTime)))
        # Retry until the job information could be obtained
        while True:
            try:
                job.getJobInformation(self.diracILC)
                if job.inputFile:
                    lfnCache.append(job.inputFile)
                if job.outputFiles:
                    lfnCache.extend(job.outputFiles)
                break
            except RuntimeError as e:  # try again
                self.log.error('+++++ Failure for job:', job.jobID)
                self.log.error('+++++ Exception: ', str(e))

    counter = 0
    for lfnChunk in breakListIntoChunks(list(lfnCache), 200):
        counter += 200
        if counter % 1000 == 0:
            # The progress is counted in files, so the total is the number of
            # LFNs (the number of jobs used to be reported here)
            self.log.notice('Getting FileInfo: %d/%d: %3.1fs' %
                            (counter, len(lfnCache), float(time.time() - self.startTime)))
        # Retry until the catalog answers successfully
        while True:
            try:
                reps = self.fcClient.exists(lfnChunk)
                if not reps['OK']:
                    self.log.error('Failed to check file existence, try again...', reps['Message'])
                    raise RuntimeError('Try again')
                statuses = reps['Value']
                lfnExistence.update(statuses['Successful'])
                break
            except RuntimeError:  # try again
                pass

    return lfnExistence
def getJobStatus(self, jobIDList):
    """Get the status information for the given list of jobs.

    The job references are handled in chunks of 100. Each reference is parsed
    with ``pfnparse`` and its ``FileName`` part is used as the job number.
    The job numbers of a chunk are passed to ``bjobs`` through an SSH call
    and the output is searched, line by line, for every job number. The
    third whitespace separated field of a matching line is translated into
    'Done', 'Running' or 'Waiting'. A job stays 'Unknown' when no line
    matches, when the line contains 'UNKWN' or when the field holds any
    other value. If several lines match, the last recognised one wins.

    :param list jobIDList: job references
    :returns: S_OK with a dictionary job reference -> status; the failing
              result if a reference cannot be parsed or the SSH call fails
    """
    lsfToDirac = {'DONE': 'Done', 'EXIT': 'Done',
                  'RUN': 'Running', 'SSUSP': 'Running',
                  'PEND': 'Waiting', 'PSUSP': 'Waiting'}

    resultDict = {}
    ssh = SSH(parameters=self.ceParameters)

    for jobList in breakListIntoChunks(jobIDList, 100):
        jobDict = {}
        for job in jobList:
            result = pfnparse(job)
            if not result['OK']:
                # Report the parsing problem the same way as a failing SSH
                # call instead of failing on the missing 'Value' key
                return result
            jobNumber = result['Value']['FileName']
            if jobNumber:
                jobDict[jobNumber] = job

        cmd = ['bjobs', ' '.join(jobDict)]
        result = ssh.sshCall(100, cmd)
        if not result['OK']:
            return result

        lines = result['Value'][1].replace('\r', '').split('\n')
        for jobNumber, job in jobDict.items():
            resultDict[job] = 'Unknown'
            for line in lines:
                # Substring match: the number may appear anywhere in the line
                if jobNumber not in line:
                    continue
                if 'UNKWN' in line:
                    resultDict[job] = 'Unknown'
                    continue
                fields = line.split()
                if len(fields) < 3:
                    # Too short to carry a status field, nothing to learn
                    continue
                status = lsfToDirac.get(fields[2])
                if status is not None:
                    resultDict[job] = status

    return S_OK(resultDict)
def _getJobStatusOnHost(self, jobIDList, host=None):
    """Get the status information for the given list of jobs.

    Every job reference is reduced to its stamp, the base name of the path
    part of the reference. The stamps are sent to the host in chunks of 100
    with the 'getJobStatus' host command. The statuses found under ``Jobs``
    in each reply are returned under the job reference the stamp was derived
    from, which is how the ``getJobStatus`` methods key their results; a key
    of the reply that is not a known stamp is kept unchanged.

    :param list jobIDList: job references
    :param host: passed on unchanged to the host command
    :returns: S_OK with a dictionary job reference -> status; the failing
              result if the host command fails; S_ERROR if the reply carries
              a non-zero ``Status``
    """
    resultDict = {}

    # stamp -> original reference, needed to translate the reply back
    jobDict = {}
    for job in jobIDList:
        stamp = os.path.basename(urlparse(job).path)
        jobDict[stamp] = job

    # A real list is handed over: a dictionary view cannot be sliced
    for jobList in breakListIntoChunks(list(jobDict), 100):
        resultCommand = self.__executeHostCommand('getJobStatus',
                                                  {'JobIDList': jobList},
                                                  host=host)
        if not resultCommand['OK']:
            return resultCommand

        result = resultCommand['Value']
        if result['Status'] != 0:
            return S_ERROR('Failed to get job status: %s' % result['Message'])

        for stamp, status in result['Jobs'].items():
            resultDict[jobDict.get(stamp, stamp)] = status

    return S_OK(resultDict)
def getJobStatus(self, jobIDList):
    """Get the status information for the given list of jobs.

    The job IDs are handled in chunks of 100. The part of an ID in front of
    the first '.' is used as the job number. The complete IDs of a chunk are
    passed to ``qstat`` through an SSH call and the output is searched, line
    by line, for every job number. The fifth whitespace separated field of a
    matching line is translated into 'Done', 'Running' or 'Waiting'. A job
    stays 'Unknown' when no line matches, when the line contains 'Unknown'
    or when the field holds any other value. If several lines match, the
    last recognised one wins. IDs with an empty job number are not reported.

    :param list jobIDList: job IDs
    :returns: S_OK with a dictionary job ID -> status, or the failing result
              of the SSH call
    """
    torqueToDirac = {'E': 'Done', 'C': 'Done',
                     'R': 'Running',
                     'S': 'Waiting', 'W': 'Waiting', 'Q': 'Waiting',
                     'H': 'Waiting', 'T': 'Waiting'}

    resultDict = {}
    ssh = SSH(self.sshUser, self.sshHost, self.sshPassword)

    for jobList in breakListIntoChunks(jobIDList, 100):
        jobDict = {}
        for job in jobList:
            jobNumber = job.split('.')[0]
            if jobNumber:
                jobDict[jobNumber] = job

        cmd = ['qstat', ' '.join(jobList)]
        result = ssh.sshCall(10, cmd)
        if not result['OK']:
            return result

        lines = result['Value'][1].replace('\r', '').split('\n')
        for jobNumber, job in jobDict.items():
            resultDict[job] = 'Unknown'
            for line in lines:
                # Substring match: the number may appear anywhere in the line
                if jobNumber not in line:
                    continue
                if 'Unknown' in line:
                    resultDict[job] = 'Unknown'
                    continue
                fields = line.split()
                if len(fields) < 5:
                    # Too short to carry a status field, nothing to learn
                    continue
                status = torqueToDirac.get(fields[4])
                if status is not None:
                    resultDict[job] = status

    return S_OK(resultDict)
def _BroadcastProcessed(self):
    """Create tasks only for the input files which have descendents.

    When the transformation status is 'Flush' all files are passed on to
    ``_Broadcast`` untouched. Otherwise the descendents of the input files
    are queried, at most 200 files per call, and every file which is not
    reported as successful or has no descendents is removed from the input
    data before ``_Broadcast`` is called.

    The dictionary in ``self.data`` is modified in place, so the files
    removed before a failing query stay removed.

    :returns: the result of ``_Broadcast``, or the failing result of
              ``getFileDescendents``
    """
    transformationStatus = self.params['Status']
    if transformationStatus in ('Flush', ):
        self.util.logInfo("Flushing transformation, passing all files on")
        return self._Broadcast()

    inputFiles = self.data
    self.util.logInfo("Number of input files before selection: %d " % len(inputFiles))

    # Query only a maximum of 200 files in one go. The keys are copied into
    # a list first, because entries are removed from the dictionary while
    # the chunks are processed.
    for ifList in breakListIntoChunks(list(inputFiles), 200):
        resDesc = self.util.fc.getFileDescendents(ifList, depths=1)
        self.util.logDebug("Result from getFileDescendents: %s " % resDesc)
        if not resDesc['OK']:
            return resDesc

        descendents = resDesc['Value']
        successful = descendents['Successful']
        for lfn in ifList:
            if lfn not in successful:
                self.util.logDebug("Removed: %s, not in succesful " % lfn)
            elif not successful[lfn]:
                self.util.logDebug("Removed: %s no descendents" % lfn)
            else:
                continue
            inputFiles.pop(lfn, None)

        if descendents['Failed']:
            self.util.logWarn("Failed getDescendents: %s " % descendents['Failed'])

    self.util.logInfo("Number of input files after selection: %d " % len(inputFiles))
    self.data = inputFiles
    return self._Broadcast()