def execute( self, dataToResolve = None ):
  """This method is called to obtain the TURLs for all requested input data
     firstly by available site protocols and redundantly via TURL construction.
     If TURLs are missing, these are conveyed in the result to the caller.
  """
  # Define local configuration options present at every site
  localSEList = self.configuration['LocalSEList']
  self.jobID = self.configuration.get( 'JobID' )
  allReplicas = self.configuration.get( 'AllReplicas', False )
  if allReplicas:
    self.log.info( 'All replicas will be used in the resolution' )

  if dataToResolve:
    self.log.verbose( 'Data to resolve passed directly to InputDataByProtocol module' )
    self.inputData = dataToResolve  # e.g. list supplied by another module

  self.inputData = [x.replace( 'LFN:', '' ) for x in self.inputData]
  self.log.verbose( 'InputData requirement to be resolved by protocol is:\n%s' % '\n'.join( self.inputData ) )

  # First make a check in case replicas have been removed or are not accessible
  # from the local site (remove these from consideration for local protocols)
  replicas = self.fileCatalogResult['Value']['Successful']
  self.log.debug( 'File Catalogue result is:\n%s' % str( replicas ) )

  # First get the preferred replica:
  result = self.__resolveReplicas( localSEList, replicas )
  if not result['OK']:
    return result
  success = result['Successful']
  if not allReplicas:
    bestReplica = {}
    for lfn in success:
      bestReplica[lfn] = success[lfn][0]
    ret = S_OK()
    ret.update( {'Successful': bestReplica, 'Failed': result['Failed']} )
    return ret

  # If all replicas are requested, get results for other SEs
  seList = set()
  localSESet = set( localSEList )
  for lfn in replicas.keys():
    extraSEs = set( replicas[lfn] ) - localSESet
    # If any extra SE, add it to the set, otherwise don't consider that file
    if extraSEs:
      seList.update( extraSEs )
    else:
      replicas.pop( lfn )
  seList -= self.metaKeys

  if seList:
    result = self.__resolveReplicas( seList, replicas, ignoreTape = True )
    if not result['OK']:
      return result
    for lfn in result['Successful']:
      success.setdefault( lfn, [] ).extend( result['Successful'][lfn] )

  # Only consider failed the files that are not successful as well
  failed = [lfn for lfn in result['Failed'] if lfn not in success]
  return S_OK( {'Successful': success, 'Failed': failed} )
def available(self, jobIDList=None):
  """This method returns the number of available slots in the target CE.
  The CE instance polls for waiting and running jobs and compares to the limits
  in the CE parameters.

  :param jobIDList: list of already existing job IDs to be checked against
  :type jobIDList: python:list
  """
  # If there are no already registered jobs
  if jobIDList is not None and not jobIDList:
    result = S_OK()
    result['RunningJobs'] = 0
    result['WaitingJobs'] = 0
    result['SubmittedJobs'] = 0
  else:
    result = self.ceParameters.get('CEType')
    if result and result == 'CREAM':
      result = self.getCEStatus(jobIDList)
    else:
      result = self.getCEStatus()
    if not result['OK']:
      return result

  runningJobs = result['RunningJobs']
  waitingJobs = result['WaitingJobs']
  submittedJobs = result['SubmittedJobs']
  availableProcessors = result.get('AvailableProcessors')
  ceInfoDict = dict(result)

  maxTotalJobs = int(self.ceParameters.get('MaxTotalJobs', 0))
  ceInfoDict['MaxTotalJobs'] = maxTotalJobs
  waitingToRunningRatio = float(self.ceParameters.get('WaitingToRunningRatio', 0.0))
  # If there are no running jobs we can submit at most 'MaxWaitingJobs';
  # if there are running jobs we can increase this to reach a W / R ratio of 'WaitingToRunningRatio'
  maxWaitingJobs = int(max(int(self.ceParameters.get('MaxWaitingJobs', 0)),
                           runningJobs * waitingToRunningRatio))

  self.log.verbose('Max Number of Jobs:', maxTotalJobs)
  self.log.verbose('Max W/R Ratio:', waitingToRunningRatio)
  self.log.verbose('Max Waiting Jobs:', maxWaitingJobs)

  # Determine how many more jobs can be submitted
  message = '%s CE: SubmittedJobs=%s' % (self.ceName, submittedJobs)
  message += ', WaitingJobs=%s, RunningJobs=%s' % (waitingJobs, runningJobs)
  totalJobs = runningJobs + waitingJobs
  message += ', MaxTotalJobs=%s' % (maxTotalJobs)

  if totalJobs >= maxTotalJobs:
    self.log.verbose('Max Number of Jobs reached:', maxTotalJobs)
    result['Value'] = 0
    message = 'There are %s waiting jobs and total jobs %s >= %s max total jobs' % (
        waitingJobs, totalJobs, maxTotalJobs)
  else:
    additionalJobs = 0
    if waitingJobs < maxWaitingJobs:
      additionalJobs = maxWaitingJobs - waitingJobs
      if totalJobs + additionalJobs >= maxTotalJobs:
        additionalJobs = maxTotalJobs - totalJobs
    # For the SSH CE case
    if int(self.ceParameters.get('MaxWaitingJobs', 0)) == 0:
      additionalJobs = maxTotalJobs - runningJobs

    if availableProcessors is not None:
      additionalJobs = min(additionalJobs, availableProcessors)
    result['Value'] = additionalJobs

  result['Message'] = message
  result['CEInfoDict'] = ceInfoDict
  return result
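# Illustrative worked example (comments only, with hypothetical CE parameters):
# with MaxTotalJobs=100, MaxWaitingJobs=10 and WaitingToRunningRatio=0.5, a poll
# reporting RunningJobs=40 and WaitingJobs=5 gives
# maxWaitingJobs = max(10, 40 * 0.5) = 20, so result['Value'] = 20 - 5 = 15
# additional jobs may be submitted (totalJobs + additionalJobs = 60 < MaxTotalJobs).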
def storeProxy( self, userDN, userGroup, chain ):
  """ Store user proxy into the Proxy repository for a user specified by his
      DN and group.
  """
  retVal = Registry.getUsernameForDN( userDN )
  if not retVal[ 'OK' ]:
    return retVal
  userName = retVal[ 'Value' ]
  # Get remaining secs
  retVal = chain.getRemainingSecs()
  if not retVal[ 'OK' ]:
    return retVal
  remainingSecs = retVal[ 'Value' ]
  if remainingSecs < self._minSecsToAllowStore:
    return S_ERROR( "Cannot store proxy, remaining secs %s is less than %s" % ( remainingSecs, self._minSecsToAllowStore ) )
  # Compare the DNs
  retVal = chain.getIssuerCert()
  if not retVal[ 'OK' ]:
    return retVal
  proxyIdentityDN = retVal[ 'Value' ].getSubjectDN()[ 'Value' ]
  if not userDN == proxyIdentityDN:
    msg = "Mismatch in the user DN"
    vMsg = "Proxy says %s and credentials are %s" % ( proxyIdentityDN, userDN )
    self.log.error( msg, vMsg )
    return S_ERROR( "%s. %s" % ( msg, vMsg ) )
  # Check the groups
  retVal = chain.getDIRACGroup()
  if not retVal[ 'OK' ]:
    return retVal
  proxyGroup = retVal[ 'Value' ]
  if not proxyGroup:
    proxyGroup = Registry.getDefaultUserGroup()
  if not userGroup == proxyGroup:
    msg = "Mismatch in the user group"
    vMsg = "Proxy says %s and credentials are %s" % ( proxyGroup, userGroup )
    self.log.error( msg, vMsg )
    return S_ERROR( "%s. %s" % ( msg, vMsg ) )
  # Check if it's a limited proxy
  if chain.isLimitedProxy()['Value']:
    return S_ERROR( "Limited proxies are not allowed to be stored" )
  dLeft = remainingSecs / 86400
  hLeft = remainingSecs / 3600 - dLeft * 24
  mLeft = remainingSecs / 60 - hLeft * 60 - dLeft * 1440
  sLeft = remainingSecs - hLeft * 3600 - mLeft * 60 - dLeft * 86400
  self.log.info( "Storing proxy for credentials %s (%d:%02d:%02d:%02d left)" % ( proxyIdentityDN, dLeft, hLeft, mLeft, sLeft ) )

  try:
    sUserDN = self._escapeString( userDN )[ 'Value' ]
    sUserGroup = self._escapeString( userGroup )[ 'Value' ]
  except KeyError:
    return S_ERROR( "Cannot escape DN" )
  # Check what we have already got in the repository
  cmd = "SELECT TIMESTAMPDIFF( SECOND, UTC_TIMESTAMP(), ExpirationTime ), Pem FROM `ProxyDB_Proxies` WHERE UserDN=%s AND UserGroup=%s" % ( sUserDN, sUserGroup )
  result = self._query( cmd )
  if not result['OK']:
    return result
  # Check if there is a previous ticket for the DN
  data = result[ 'Value' ]
  sqlInsert = True
  if len( data ) > 0:
    sqlInsert = False
    pem = data[0][1]
    if pem:
      remainingSecsInDB = data[0][0]
      if remainingSecs <= remainingSecsInDB:
        self.log.info( "Proxy stored is longer than uploaded, omitting.",
                       "%s in uploaded, %s in db" % ( remainingSecs, remainingSecsInDB ) )
        return S_OK()

  pemChain = chain.dumpAllToString()['Value']
  dValues = { 'UserName' : self._escapeString( userName )[ 'Value' ],
              'UserDN' : sUserDN,
              'UserGroup' : sUserGroup,
              'Pem' : self._escapeString( pemChain )[ 'Value' ],
              'ExpirationTime' : 'TIMESTAMPADD( SECOND, %d, UTC_TIMESTAMP() )' % int( remainingSecs ),
              'PersistentFlag' : "'False'" }
  if sqlInsert:
    sqlFields = []
    sqlValues = []
    for key in dValues:
      sqlFields.append( key )
      sqlValues.append( dValues[ key ] )
    cmd = "INSERT INTO `ProxyDB_Proxies` ( %s ) VALUES ( %s )" % ( ", ".join( sqlFields ), ", ".join( sqlValues ) )
  else:
    sqlSet = []
    sqlWhere = []
    for k in dValues:
      if k in ( 'UserDN', 'UserGroup' ):
        sqlWhere.append( "%s = %s" % ( k, dValues[k] ) )
      else:
        sqlSet.append( "%s = %s" % ( k, dValues[k] ) )
    cmd = "UPDATE `ProxyDB_Proxies` SET %s WHERE %s" % ( ", ".join( sqlSet ), " AND ".join( sqlWhere ) )

  self.logAction( "store proxy", userDN, userGroup, userDN, userGroup )
  return self._update( cmd )
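# Worked example for the day/hour/minute/second breakdown above (illustrative,
# relying on Python 2 integer division): remainingSecs = 90000 gives
# dLeft = 1, hLeft = 25 - 24 = 1, mLeft = 1500 - 60 - 1440 = 0, sLeft = 0,
# i.e. the log line reads "(1:01:00:00 left)".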
def __uploadInputSandbox(self, classAdJob, jobDescriptionObject=None):
  """Checks the validity of the job Input Sandbox.
     The function returns the list of Input Sandbox files.
     The total volume of the input sandbox is evaluated
  """
  inputSandbox = self.__getInputSandboxEntries(classAdJob)

  realFiles = []
  badFiles = []
  diskFiles = []

  for isFile in inputSandbox:
    if not isFile.startswith(('lfn:', 'LFN:', 'SB:', '%s', '%(')):
      realFiles.append(isFile)

  stringIOFiles = []
  stringIOFilesSize = 0
  if jobDescriptionObject is not None:
    if isinstance(jobDescriptionObject, StringIO):
      stringIOFiles = [jobDescriptionObject]
      stringIOFilesSize = len(jobDescriptionObject.getvalue())
      gLogger.debug("Size of the stringIOFiles: " + str(stringIOFilesSize))
    else:
      return S_ERROR(EWMSJDL, "jobDescriptionObject is not a StringIO object")

  # Check real files
  for isFile in realFiles:
    if not os.path.exists(isFile):  # we are passing in real files, we expect them to be on disk
      badFiles.append(isFile)
      gLogger.warn("inputSandbox file/directory " + isFile + " not found. Keep looking for the others")
      continue
    diskFiles.append(isFile)

  diskFilesSize = File.getGlobbedTotalSize(diskFiles)
  gLogger.debug("Size of the diskFiles: " + str(diskFilesSize))
  totalSize = diskFilesSize + stringIOFilesSize
  gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))

  okFiles = stringIOFiles + diskFiles
  if badFiles:
    result = S_ERROR(EWMSJDL, 'Input Sandbox is not valid')
    result['BadFile'] = badFiles
    result['TotalSize'] = totalSize
    return result

  if okFiles:
    if not self.sandboxClient:
      self.sandboxClient = SandboxStoreClient(useCertificates=self.useCertificates,
                                              delegatedDN=self.delegatedDN,
                                              delegatedGroup=self.delegatedGroup)
    result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
    if not result['OK']:
      return result
    inputSandbox.append(result['Value'])
    classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)

  return S_OK()
def getQueues(siteList=None, ceList=None, ceTypeList=None, community=None, mode=None):
  """ Get CE/queue options according to the specified selection
  """
  result = gConfig.getSections('/Resources/Sites')
  if not result['OK']:
    return result

  resultDict = {}

  grids = result['Value']
  for grid in grids:
    result = gConfig.getSections('/Resources/Sites/%s' % grid)
    if not result['OK']:
      continue
    sites = result['Value']
    for site in sites:
      if siteList is not None and site not in siteList:
        continue
      if community:
        comList = gConfig.getValue('/Resources/Sites/%s/%s/VO' % (grid, site), [])
        if comList and community not in comList:
          continue
      siteCEParameters = {}
      result = gConfig.getOptionsDict('/Resources/Sites/%s/%s/CEs' % (grid, site))
      if result['OK']:
        siteCEParameters = result['Value']
      result = gConfig.getSections('/Resources/Sites/%s/%s/CEs' % (grid, site))
      if not result['OK']:
        continue
      ces = result['Value']
      for ce in ces:
        if mode:
          ceMode = gConfig.getValue('/Resources/Sites/%s/%s/CEs/%s/SubmissionMode' % (grid, site, ce), 'Direct')
          if not ceMode or ceMode != mode:
            continue
        if ceTypeList:
          ceType = gConfig.getValue('/Resources/Sites/%s/%s/CEs/%s/CEType' % (grid, site, ce), '')
          if not ceType or ceType not in ceTypeList:
            continue
        if ceList is not None and ce not in ceList:
          continue
        if community:
          comList = gConfig.getValue('/Resources/Sites/%s/%s/CEs/%s/VO' % (grid, site, ce), [])
          if comList and community not in comList:
            continue
        ceOptionsDict = dict(siteCEParameters)
        result = gConfig.getOptionsDict('/Resources/Sites/%s/%s/CEs/%s' % (grid, site, ce))
        if not result['OK']:
          continue
        ceOptionsDict.update(result['Value'])
        result = gConfig.getSections('/Resources/Sites/%s/%s/CEs/%s/Queues' % (grid, site, ce))
        if not result['OK']:
          continue
        queues = result['Value']
        for queue in queues:
          if community:
            comList = gConfig.getValue('/Resources/Sites/%s/%s/CEs/%s/Queues/%s/VO' % (grid, site, ce, queue), [])
            if comList and community not in comList:
              continue
          resultDict.setdefault(site, {})
          resultDict[site].setdefault(ce, ceOptionsDict)
          resultDict[site][ce].setdefault('Queues', {})
          result = gConfig.getOptionsDict('/Resources/Sites/%s/%s/CEs/%s/Queues/%s' % (grid, site, ce, queue))
          if not result['OK']:
            continue
          queueOptionsDict = result['Value']
          resultDict[site][ce]['Queues'][queue] = queueOptionsDict

  return S_OK(resultDict)
def physicalRemoval(self, index, requestObj, subRequestAttrs, subRequestFiles):
  """ action for 'physicalRemoval' operation

  :param self: self reference
  :param int index: subRequest index in execution order
  :param RequestContainer requestObj: request
  :param dict subRequestAttrs: subRequest's attributes
  :param dict subRequestFiles: subRequest's files
  """
  self.info("physicalRemoval: processing subrequest %s" % index)
  if requestObj.isSubRequestEmpty(index, "removal")["Value"]:
    self.info("physicalRemoval: subrequest %s is empty, setting its status to 'Done'" % index)
    requestObj.setSubRequestStatus(index, "removal", "Done")
    return S_OK(requestObj)

  targetSEs = list(set([targetSE.strip() for targetSE in subRequestAttrs["TargetSE"].split(",")
                        if targetSE.strip()]))
  pfns = []
  pfnToLfn = {}
  for subRequestFile in subRequestFiles:
    if subRequestFile["Status"] == "Waiting":
      pfn = subRequestFile["PFN"]
      lfn = subRequestFile["LFN"]
      pfnToLfn[pfn] = lfn
      pfns.append(pfn)

  failed = {}
  errors = {}
  self.addMark('PhysicalRemovalAtt', len(pfns))
  for targetSE in targetSEs:
    remove = self.replicaManager().removeStorageFile(pfns, targetSE)
    if remove["OK"]:
      for pfn in remove["Value"]["Failed"]:
        if pfn not in failed:
          failed[pfn] = {}
        failed[pfn][targetSE] = remove["Value"]["Failed"][pfn]
    else:
      errors[targetSE] = remove["Message"]
      for pfn in pfns:
        if pfn not in failed:
          failed[pfn] = {}
        failed[pfn][targetSE] = "Completely"

  failedPFNs = failed.keys()
  pfnsOK = [pfn for pfn in pfns if pfn not in failedPFNs]
  self.addMark("PhysicalRemovalDone", len(pfnsOK))

  for pfn in pfnsOK:
    self.info("physicalRemoval: successfully removed %s from %s" % (pfn, str(targetSEs)))
    res = requestObj.setSubRequestFileAttributeValue(index, "removal", pfnToLfn[pfn], "Status", "Done")
    if not res["OK"]:
      self.error("physicalRemoval: error setting status to 'Done' for %s" % pfnToLfn[pfn])

  if failed:
    self.addMark("PhysicalRemovalFail", len(failedPFNs))
    for pfn in failed:
      for targetSE in failed[pfn]:
        if type(failed[pfn][targetSE]) in StringTypes:
          if re.search("no such file or directory", failed[pfn][targetSE].lower()):
            self.info("physicalRemoval: file %s did not exist" % pfn)
            res = requestObj.setSubRequestFileAttributeValue(index, "removal", pfnToLfn[pfn], "Status", "Done")
            if not res["OK"]:
              self.error("physicalRemoval: error setting status to 'Done' for %s" % pfnToLfn[pfn])

  if errors:
    for targetSE in errors:
      self.warn("physicalRemoval: completely failed to remove files at %s" % targetSE)

  # subrequest empty or all Files done?
  if requestObj.isSubRequestDone(index, "removal")["Value"]:
    self.info("physicalRemoval: all files processed, setting subrequest status to 'Done'")
    requestObj.setSubRequestStatus(index, "removal", "Done")

  return S_OK(requestObj)
def replicaRemoval(self, index, requestObj, subRequestAttrs, subRequestFiles):
  """ action for 'replicaRemoval' operation

  :param self: self reference
  :param int index: subRequest index in execution order
  :param RequestContainer requestObj: request
  :param dict subRequestAttrs: subRequest's attributes
  :param dict subRequestFiles: subRequest's files

  TODO: add bulk removal first
  """
  self.info("replicaRemoval: processing subrequest %s" % index)
  if requestObj.isSubRequestEmpty(index, "removal")["Value"]:
    self.info("replicaRemoval: subrequest %s is empty, setting its status to 'Done'" % index)
    requestObj.setSubRequestStatus(index, "removal", "Done")
    return S_OK(requestObj)

  targetSEs = list(set([targetSE.strip() for targetSE in subRequestAttrs["TargetSE"].split(",")
                        if targetSE.strip()]))
  lfns = [str(subRequestFile["LFN"]) for subRequestFile in subRequestFiles
          if subRequestFile["Status"] == "Waiting" and str(subRequestFile["LFN"])]

  self.debug("replicaRemoval: found %s lfns to delete from %s sites (%s replicas)" %
             (len(lfns), len(targetSEs), len(lfns) * len(targetSEs)))
  self.addMark("ReplicaRemovalAtt", len(lfns) * len(targetSEs))

  removalStatus = {}

  # loop over LFNs
  for lfn in lfns:
    self.info("replicaRemoval: processing file %s" % lfn)
    # prepare status dict
    removalStatus[lfn] = dict.fromkeys(targetSEs, "")
    # loop over targetSEs
    try:
      for targetSE in targetSEs:
        # try to remove using current proxy
        removeReplica = self.replicaManager().removeReplica(targetSE, lfn)
        # file is not existing?
        if not removeReplica["OK"] and "no such file or directory" in str(removeReplica["Message"]).lower():
          removalStatus[lfn][targetSE] = removeReplica["Message"]
          continue

        # not OK but request belongs to DataManager?
        # (parentheses group the two error conditions under the requestOwnerDN check)
        if not self.requestOwnerDN and \
           ((not removeReplica["OK"] and
             "Write access not permitted for this credential." in removeReplica["Message"]) or
            (removeReplica["OK"] and "Failed" in removeReplica["Value"] and
             lfn in removeReplica["Value"]["Failed"] and
             "permission denied" in str(removeReplica["Value"]["Failed"][lfn]).lower())):
          # get proxy for LFN
          getProxyForLFN = self.getProxyForLFN(lfn)
          # can't get correct proxy?
          if not getProxyForLFN["OK"]:
            self.warn("replicaRemoval: unable to get proxy for file %s: %s" %
                      (lfn, getProxyForLFN["Message"]))
            removeReplica = getProxyForLFN
          else:
            # got correct proxy? try to remove again
            removeReplica = self.replicaManager().removeReplica(targetSE, lfn)

        if not removeReplica["OK"]:
          removalStatus[lfn][targetSE] = removeReplica["Message"]
          continue
        removeReplica = removeReplica["Value"]
        # check failed status for missing files
        if lfn in removeReplica["Failed"]:
          removalStatus[lfn][targetSE] = removeReplica["Failed"][lfn]
    finally:
      # make sure DataManager proxy is set back in place
      if not self.requestOwnerDN and self.dataManagerProxy():
        # remove temp proxy
        if os.environ["X509_USER_PROXY"] != self.dataManagerProxy():
          os.unlink(os.environ["X509_USER_PROXY"])
        # put back DataManager proxy
        os.environ["X509_USER_PROXY"] = self.dataManagerProxy()

  replicasRemoved = 0
  replicasFailed = 0
  subRequestError = []

  # filter out missing files
  for lfn, pTargetSEs in removalStatus.items():
    for targetSE, error in pTargetSEs.items():
      if "no such file or directory" in str(error).lower():
        removalStatus[lfn][targetSE] = ""

  # loop over statuses and errors
  for lfn, pTargetSEs in removalStatus.items():
    failed = [(targetSE, error) for targetSE, error in pTargetSEs.items() if error != ""]
    successful = [(targetSE, error) for targetSE, error in pTargetSEs.items() if error == ""]

    replicasRemoved += len(successful)
    replicasFailed += len(failed)

    if not failed:
      self.info("replicaRemoval: successfully removed %s from %s" % (lfn, str(targetSEs)))
      updateStatus = requestObj.setSubRequestFileAttributeValue(index, "removal", lfn, "Status", "Done")
      if not updateStatus["OK"]:
        self.error("replicaRemoval: error setting status to 'Done' for %s" % lfn)
      continue

    for targetSE, error in failed:
      self.warn("replicaRemoval: failed to remove %s from %s: %s" % (lfn, targetSE, error))

    fileError = ";".join(["%s:%s" % (targetSE, str(error).replace("'", ""))
                          for targetSE, error in failed])[:255]
    subRequestError.append(fileError)
    fileError = requestObj.setSubRequestFileAttributeValue(index, "removal", lfn, "Error", fileError)
    if not fileError["OK"]:
      self.error("replicaRemoval: unable to set Error for %s: %s" % (lfn, fileError["Message"]))

  self.addMark("ReplicaRemovalDone", replicasRemoved)
  self.addMark("ReplicaRemovalFail", replicasFailed)

  # no 'Waiting' files or all 'Done'
  if requestObj.isSubRequestDone(index, "removal")["Value"]:
    self.info("replicaRemoval: all files processed, setting subrequest status to 'Done'")
    requestObj.setSubRequestStatus(index, "removal", "Done")
  elif replicasFailed:
    self.info("replicaRemoval: all files processed, failed to remove %s replicas" % replicasFailed)
    subRequestError = ";".join(subRequestError).replace("'", "")[:255]
    subRequestError = requestObj.setSubRequestAttributeValue(index, "removal", "Error", subRequestError)

  # return requestObj at least
  return S_OK(requestObj)
def execute(self):
  """ The main agent execution method """
  limitDate = date() - self._period
  limitDateString = toString(limitDate)
  tableList = ["MessageRepository", "FixedTextMessages", "Systems", "SubSystems"]
  columnsList = ["SystemName", "SubSystemName", "count(*) as entries", "FixedTextString"]
  cmd = "SELECT " + ', '.join(columnsList) + " FROM " \
        + " NATURAL JOIN ".join(tableList) \
        + " WHERE MessageTime > '%s'" % limitDateString \
        + " GROUP BY FixedTextString HAVING entries > %s" % self._threshold \
        + " ORDER BY entries DESC LIMIT %i;" % self._limit

  result = self.SystemLoggingDB._query(cmd)
  if not result['OK']:
    return result

  messageList = result['Value']
  if messageList == 'None' or messageList == ():
    self.log.warn('The DB query returned an empty result')
    return S_OK()

  mailBody = '\n'
  for message in messageList:
    mailBody = mailBody + "Count: " + str(message[2]) + "\tError: '" \
               + message[3] + "'\tSystem: '" + message[0] \
               + "'\tSubsystem: '" + message[1] + "'\n"

  mailBody = mailBody + "\n\n-------------------------------------------------------\n" \
             + "Please do not reply to this mail. It was automatically\n" \
             + "generated by a Dirac Agent.\n"

  result = self.SystemLoggingDB._getDataFromAgentTable(self.agentName)
  self.log.debug(result)
  if not result['OK']:
    errorString = "Could not get the date when the last mail was sent"
    self.log.error(errorString)
    return S_ERROR(errorString)
  else:
    if len(result['Value']):
      self.log.debug("date value: %s" % fromString(result['Value'][0][0][1:-1]))
      lastMailSentDate = fromString(result['Value'][0][0][1:-1])
    else:
      lastMailSentDate = limitDate - 1 * day
      result = self.SystemLoggingDB._insertDataIntoAgentTable(self.agentName, lastMailSentDate)
      if not result['OK']:
        errorString = "Could not insert data into the DB"
        self.log.error(errorString, result['Message'])
        return S_ERROR(errorString + ": " + result['Message'])

  self.log.debug("limitDate: %s\t" % limitDate + "lastMailSentDate: %s\n" % lastMailSentDate)
  if lastMailSentDate > limitDate:
    self.log.info("The previous report was sent less than %s days ago" % self.__days)
    return S_OK()

  dateSent = toString(date())
  self.log.info("The list with the top errors has been sent")

  result = self.SystemLoggingDB._insertDataIntoAgentTable(self.agentName, dateSent)
  if not result['OK']:
    errorString = "Could not insert data into the DB"
    self.log.error(errorString, result['Message'])
    return S_ERROR(errorString + ": " + result['Message'])

  result = self.notification.sendMail(self._mailAddress, self._subject, mailBody)
  if not result['OK']:
    self.log.warn("The notification could not be sent")
    return S_OK()

  return S_OK("The list with the top errors has been sent")
def put(self, message, parameters=None):
  return S_OK("FakeMQConnection sending message: " + str(message))
def disconnect(self):
  return S_OK("FakeMQConnection disconnecting")
def initializeHandler(cls, serviceInfo):
  """ Handler initialization """
  cls.upDB = UserProfileDB()
  return S_OK()
def _userjobmodules(self, stepdefinition):
  res1 = self._setApplicationModuleAndParameters(stepdefinition)
  res2 = self._setUserJobFinalization(stepdefinition)
  if not res1["OK"] or not res2["OK"]:
    return S_ERROR('userjobmodules failed')
  return S_OK()
def _setStepParametersValues(self, instance):
  self._setBaseStepParametersValues(instance)
  for depn, depv in self.dependencies.items():
    self._job._addSoftware(depn, depv)
  return S_OK()
def _addParametersToStep(self, stepdefinition):
  res = self._addBaseParameters(stepdefinition)
  if not res["OK"]:
    return S_ERROR("Failed to set base parameters")
  return S_OK()
def _prodjobmodules(self, stepdefinition):
  res1 = self._setApplicationModuleAndParameters(stepdefinition)
  res2 = self._setOutputComputeDataList(stepdefinition)
  if not res1["OK"] or not res2["OK"]:
    return S_ERROR('prodjobmodules failed')
  return S_OK()
def __resolveReplicas( self, seList, replicas, ignoreTape = False ):
  diskSEs = set()
  tapeSEs = set()
  if not seList:
    ret = S_OK()
    ret.update( {'Successful': [], 'Failed': []} )
    return ret

  for localSE in seList:
    seStatus = self.__storageElement( localSE ).getStatus()['Value']
    if seStatus['Read'] and seStatus['DiskSE']:
      diskSEs.add( localSE )
    elif seStatus['Read'] and seStatus['TapeSE']:
      tapeSEs.add( localSE )

  # For the unlikely case that a file is found on two SEs at the same site
  # disk-based replicas are favoured.
  # Problematic files will be returned and can be handled by another module
  failedReplicas = set()
  newReplicasDict = {}
  for lfn, reps in replicas.items():
    if lfn in self.inputData:
      # Check that all replicas are on a valid local SE
      if not [se for se in reps if se in diskSEs.union( tapeSEs )]:
        failedReplicas.add( lfn )
      else:
        for seName in diskSEs & set( reps ):
          newReplicasDict.setdefault( lfn, [] ).append( seName )
        if not newReplicasDict.get( lfn ) and not ignoreTape:
          for seName in tapeSEs & set( reps ):
            newReplicasDict.setdefault( lfn, [] ).append( seName )

  # Check that all LFNs have at least one replica and GUID
  if failedReplicas:
    # in principle this is not a failure but depends on the policy of the VO
    # datasets could be downloaded from another site
    self.log.info( 'The following file(s) were found not to have replicas on any of %s:\n%s' % ( str( seList ), '\n'.join( sorted( failedReplicas ) ) ) )

  # Need to group files by SE in order to stage optimally
  # we know from above that all remaining files have a replica
  # (preferring disk if >1) in the local storage.
  # IMPORTANT, only add replicas for input data that is requested
  # since this module could have been executed after another.
  seFilesDict = {}
  for lfn, seNames in newReplicasDict.items():
    for seName in seNames:
      seFilesDict.setdefault( seName, [] ).append( lfn )

  sortedSEs = sorted( [ ( len( lfns ), seName ) for seName, lfns in seFilesDict.items() ], reverse = True )

  trackLFNs = {}
  for _len, seName in sortedSEs:
    for lfn in seFilesDict[seName]:
      if 'Size' in replicas[lfn] and 'GUID' in replicas[lfn]:
        trackLFNs.setdefault( lfn, [] ).append( { 'pfn': replicas.get( lfn, {} ).get( seName, lfn ),
                                                  'se': seName,
                                                  'size': replicas[lfn]['Size'],
                                                  'guid': replicas[lfn]['GUID'] } )

  self.log.debug( 'Files grouped by SEs are:\n%s' % str( seFilesDict ) )
  for seName, lfns in seFilesDict.items():
    self.log.info( ' %s LFNs found from catalog at SE %s' % ( len( lfns ), seName ) )
    self.log.verbose( '\n'.join( lfns ) )

  # Can now start to obtain TURLs for files grouped by localSE
  # for requested input data
  requestedProtocol = self.configuration.get( 'Protocol', '' )
  for seName, lfns in seFilesDict.items():
    if not lfns:
      continue
    failedReps = set()
    result = self.__storageElement( seName ).getFileMetadata( lfns )
    if not result['OK']:
      self.log.error( "Error getting metadata.", result['Message'] + ':\n%s' % '\n'.join( lfns ) )
      # If we can not get MetaData, most likely there is a problem with the SE
      # declare the replicas failed and continue
      failedReps.update( lfns )
      continue
    failed = result['Value']['Failed']
    if failed:
      # If MetaData can not be retrieved for some PFNs
      # declare them failed and go on
      for lfn in failed:
        lfns.remove( lfn )
        if isinstance( failed, dict ):
          self.log.error( failed[ lfn ], lfn )
        failedReps.add( lfn )
    for lfn, metadata in result['Value']['Successful'].items():
      if metadata['Lost']:
        error = "File has been Lost by the StorageElement %s" % seName
      elif metadata['Unavailable']:
        error = "File is declared Unavailable by the StorageElement %s" % seName
      elif seName in tapeSEs and not metadata['Cached']:
        error = "File is no longer in StorageElement %s Cache" % seName
      else:
        error = ''
      if error:
        lfns.remove( lfn )
        self.log.error( error, lfn )
        # If PFN is not available
        # declare it failed and go on
        failedReps.add( lfn )

    if None in failedReps:
      failedReps.remove( None )
    if not failedReps:
      self.log.info( 'Preliminary checks OK, getting TURLS at %s for:\n%s' % ( seName, '\n'.join( lfns ) ) )
    else:
      self.log.warn( "Errors during preliminary checks for %d files" % len( failedReps ) )

    result = self.__storageElement( seName ).getAccessUrl( lfns, protocol = requestedProtocol )
    if not result['OK']:
      self.log.error( "Error getting TURLs", result['Message'] )
      return result

    badTURLCount = 0
    badTURLs = []
    seResult = result['Value']
    for lfn, cause in seResult['Failed'].items():
      badTURLCount += 1
      badTURLs.append( 'Failed to obtain TURL for %s: %s' % ( lfn, cause ) )
      failedReps.add( lfn )

    if badTURLCount:
      self.log.warn( 'Found %s problematic TURL(s) for job %s' % ( badTURLCount, self.jobID ) )
      param = '\n'.join( badTURLs )
      self.log.info( param )
      result = self.__setJobParam( 'ProblematicTURLs', param )
      if not result['OK']:
        self.log.warn( "Error setting job param", result['Message'] )

    failedReplicas.update( failedReps )
    for lfn, turl in seResult['Successful'].items():
      for track in trackLFNs[lfn]:
        if track['se'] == seName:
          track['turl'] = turl
          break
      self.log.info( 'Resolved input data\n>>>> SE: %s\n>>>>LFN: %s\n>>>>TURL: %s' % ( seName, lfn, turl ) )
  ##### End of loop on SEs #######

  # Check if the files were actually resolved (i.e. have a TURL)
  # If so, remove them from failed list
  for lfn, mdataList in trackLFNs.items():
    for mdata in list( mdataList ):
      if 'turl' not in mdata:
        mdataList.remove( mdata )
        self.log.info( 'No TURL resolved for %s at %s' % ( lfn, mdata['se'] ) )
    if not mdataList:
      trackLFNs.pop( lfn, None )
      failedReplicas.add( lfn )
    elif lfn in failedReplicas:
      failedReplicas.remove( lfn )

  self.log.debug( 'All resolved data', sorted( trackLFNs ) )
  self.log.debug( 'All failed data', sorted( failedReplicas ) )

  ret = S_OK()
  ret.update( {'Successful': trackLFNs, 'Failed': sorted( failedReplicas )} )
  return ret
def initSEs():
  '''
    Initializes SEs statuses taking their values from the CS.
  '''
  from DIRAC.ResourceStatusSystem.Client import ResourceStatusClient
  from DIRAC.ResourceStatusSystem.PolicySystem import StateMachine
  from DIRAC.ResourceStatusSystem.Utilities import CSHelpers, RssConfiguration

  # Warm up local CS copy
  CSHelpers.warmUp()

  subLogger.info('Initializing SEs')

  rssClient = ResourceStatusClient.ResourceStatusClient()

  ses = CSHelpers.getStorageElements()
  if not ses['OK']:
    return ses
  ses = ses['Value']

  statuses = StateMachine.RSSMachine(None).getStates()
  statusTypes = RssConfiguration.RssConfiguration().getConfigStatusType('StorageElement')
  reason = 'dirac-rss-sync'

  subLogger.debug(statuses)
  subLogger.debug(statusTypes)

  for se in ses:
    subLogger.debug(se)

    opts = gConfig.getOptionsDict('/Resources/StorageElements/%s' % se)
    if not opts['OK']:
      subLogger.warn(opts['Message'])
      continue
    opts = opts['Value']

    subLogger.debug(opts)

    # We copy the list into a new object to remove items INSIDE the loop!
    statusTypesList = statusTypes[:]

    for statusType, status in opts.iteritems():
      # Sanity check...
      if statusType not in statusTypesList:
        continue

      # Transform statuses to RSS terms
      if status in ('NotAllowed', 'InActive'):
        status = 'Banned'

      if status not in statuses:
        subLogger.error('%s not a valid status for %s - %s' % (status, se, statusType))
        continue

      # We remove from the backtracking
      statusTypesList.remove(statusType)

      subLogger.debug([se, statusType, status, reason])
      result = rssClient.addOrModifyStatusElement('Resource', 'Status',
                                                  name=se,
                                                  statusType=statusType,
                                                  status=status,
                                                  elementType='StorageElement',
                                                  reason=reason)
      if not result['OK']:
        subLogger.error('Failed to modify')
        subLogger.error(result['Message'])
        continue

    # Backtracking: statusTypes not present in the CS
    for statusType in statusTypesList:
      result = rssClient.addOrModifyStatusElement('Resource', 'Status',
                                                  name=se,
                                                  statusType=statusType,
                                                  status=DEFAULT_STATUS,
                                                  elementType='StorageElement',
                                                  reason=reason)
      if not result['OK']:
        subLogger.error('Error in backtracking for %s,%s,%s' % (se, statusType, status))
        subLogger.error(result['Message'])

  return S_OK()
def initializeOptimizer(cls):
  """Initialize specific parameters for JobSanityAgent."""
  cls.sandboxClient = SandboxStoreClient(useCertificates=True, smdb=True)
  return S_OK()
def removeFile(self, index, requestObj, subRequestAttrs, subRequestFiles):
  """ action for 'removeFile' operation

  :param self: self reference
  :param int index: subRequest index in execution order
  :param RequestContainer requestObj: request
  :param dict subRequestAttrs: subRequest's attributes
  :param dict subRequestFiles: subRequest's files
  """
  self.info("removeFile: processing subrequest %s" % index)
  if requestObj.isSubRequestEmpty(index, "removal")["Value"]:
    self.info("removeFile: subrequest %s is empty, setting its status to 'Done'" % index)
    requestObj.setSubRequestStatus(index, "removal", "Done")
    return S_OK(requestObj)

  lfns = [str(subRequestFile["LFN"]) for subRequestFile in subRequestFiles
          if subRequestFile["Status"] == "Waiting" and str(subRequestFile["LFN"])]
  self.debug("removeFile: about to remove %d files" % len(lfns))
  # keep removal status for each file
  removalStatus = dict.fromkeys(lfns, "")
  self.addMark("RemoveFileAtt", len(lfns))

  # bulk removal 1st
  bulkRemoval = self.replicaManager().removeFile(lfns)
  if not bulkRemoval["OK"]:
    self.error("removeFile: unable to remove files: %s" % bulkRemoval["Message"])
    subRequestError = bulkRemoval["Message"][:255]
    subRequestError = requestObj.setSubRequestAttributeValue(index, "removal", "Error", subRequestError)
    return S_OK(requestObj)

  bulkRemoval = bulkRemoval["Value"]
  successfulLfns = bulkRemoval["Successful"] if "Successful" in bulkRemoval else []
  failedLfns = bulkRemoval["Failed"] if "Failed" in bulkRemoval else []

  toRemove = []
  for lfn in removalStatus:
    if lfn in failedLfns and "no such file or directory" in str(bulkRemoval["Failed"][lfn]).lower():
      removalStatus[lfn] = bulkRemoval["Failed"][lfn]
      removeCatalog = self.replicaManager().removeCatalogFile(lfn, singleFile=True)
      if not removeCatalog["OK"]:
        removalStatus[lfn] = removeCatalog["Message"]
        continue
    else:
      toRemove.append(lfn)

  # loop over LFNs to remove
  for lfn in toRemove:
    self.debug("removeFile: processing file %s" % lfn)
    try:
      # try to remove using proxy already defined in os.environ
      removal = self.replicaManager().removeFile(lfn)
      # file is not existing?
      if not removal["OK"] and "no such file or directory" in str(removal["Message"]).lower():
        removalStatus[lfn] = removal["Message"]
        continue
      # not OK but request belongs to DataManager?
      # (parentheses group the two error conditions under the requestOwnerDN check)
      if not self.requestOwnerDN and \
         ((not removal["OK"] and
           "Write access not permitted for this credential." in removal["Message"]) or
          (removal["OK"] and "Failed" in removal["Value"] and
           lfn in removal["Value"]["Failed"] and
           "permission denied" in str(removal["Value"]["Failed"][lfn]).lower())):
        self.debug("removeFile: retrieving proxy for %s" % lfn)
        getProxyForLFN = self.getProxyForLFN(lfn)
        # can't get correct proxy? continue...
        if not getProxyForLFN["OK"]:
          self.warn("removeFile: unable to get proxy for file %s: %s" % (lfn, getProxyForLFN["Message"]))
          removal = getProxyForLFN
        else:
          # you're a DataManager, retry with the new proxy
          removal = self.replicaManager().removeFile(lfn)
    finally:
      # make sure DataManager proxy is set back in place
      if not self.requestOwnerDN and self.dataManagerProxy():
        # remove temp proxy
        if os.environ["X509_USER_PROXY"] != self.dataManagerProxy():
          os.unlink(os.environ["X509_USER_PROXY"])
        # put back DataManager proxy
        os.environ["X509_USER_PROXY"] = self.dataManagerProxy()

    # save error
    if not removal["OK"]:
      removalStatus[lfn] = removal["Message"]
      continue
    # check fail reason, filter out missing files
    removal = removal["Value"]
    if lfn in removal["Failed"]:
      removalStatus[lfn] = removal["Failed"][lfn]

  # counters
  filesRemoved = 0
  filesFailed = 0
  subRequestError = []
  # update File statuses and errors
  for lfn, error in removalStatus.items():
    # set file error if any
    if error:
      self.debug("removeFile: %s: %s" % (lfn, str(error)))
      fileError = str(error).replace("'", "")[:255]
      fileError = requestObj.setSubRequestFileAttributeValue(index, "removal", lfn, "Error", fileError)
      if not fileError["OK"]:
        self.error("removeFile: unable to set Error for %s: %s" % (lfn, fileError["Message"]))
    # no error? file not exists? - we are able to recover
    if not error or "no such file or directory" in str(error).lower() or \
       "file does not exist" in str(error).lower():
      filesRemoved += 1
      self.info("removeFile: successfully removed %s" % lfn)
      updateStatus = requestObj.setSubRequestFileAttributeValue(index, "removal", lfn, "Status", "Done")
      if not updateStatus["OK"]:
        self.error("removeFile: unable to change status to 'Done' for %s" % lfn)
    else:
      filesFailed += 1
      self.warn("removeFile: unable to remove file %s : %s" % (lfn, error))
      errorStr = str(error)
      if isinstance(error, dict):
        errorStr = ";".join(["%s:%s" % (key, value) for key, value in error.items()])
      errorStr = errorStr.replace("'", "")
      subRequestError.append("%s:%s" % (lfn, errorStr))

  self.addMark("RemoveFileDone", filesRemoved)
  self.addMark("RemoveFileFail", filesFailed)

  # all 'Done'?
  if requestObj.isSubRequestDone(index, "removal")["Value"]:
    self.info("removeFile: all files processed, setting subrequest status to 'Done'")
    requestObj.setSubRequestStatus(index, "removal", "Done")
  elif filesFailed:
    self.info("removeFile: all files processed, %s files failed to remove" % filesFailed)
    subRequestError = ";".join(subRequestError)[:255]
    subRequestError = requestObj.setSubRequestAttributeValue(index, "removal", "Error", subRequestError)

  return S_OK(requestObj)
def getResourceUsage(self):
  """Returns a dictionary containing CPUConsumed, CPULimit, WallClockConsumed
     and WallClockLimit for the current slot. All values returned in seconds.
  """
  cmd = 'qstat -f %s' % (self.jobID)
  result = runCommand(cmd)
  if not result['OK']:
    return result

  cpu = None
  cpuLimit = None
  wallClock = None
  wallClockLimit = None

  lines = str(result['Value']).split('\n')
  for line in lines:
    info = line.split()
    if re.search('.*resources_used.cput.*', line):
      if len(info) >= 3:
        cpuList = info[2].split(':')
        newcpu = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
        if not cpu or newcpu > cpu:
          cpu = newcpu
      else:
        self.log.warn('Problem parsing "%s" for CPU consumed' % line)
    if re.search('.*resources_used.pcput.*', line):
      if len(info) >= 3:
        cpuList = info[2].split(':')
        newcpu = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
        if not cpu or newcpu > cpu:
          cpu = newcpu
      else:
        self.log.warn('Problem parsing "%s" for CPU consumed' % line)
    if re.search('.*resources_used.walltime.*', line):
      if len(info) >= 3:
        wcList = info[2].split(':')
        wallClock = (float(wcList[0]) * 60 + float(wcList[1])) * 60 + float(wcList[2])
      else:
        self.log.warn('Problem parsing "%s" for elapsed wall clock time' % line)
    if re.search('.*Resource_List.cput.*', line):
      if len(info) >= 3:
        cpuList = info[2].split(':')
        newcpuLimit = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
        if not cpuLimit or newcpuLimit < cpuLimit:
          cpuLimit = newcpuLimit
      else:
        self.log.warn('Problem parsing "%s" for CPU limit' % line)
    if re.search('.*Resource_List.pcput.*', line):
      if len(info) >= 3:
        cpuList = info[2].split(':')
        newcpuLimit = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
        if not cpuLimit or newcpuLimit < cpuLimit:
          cpuLimit = newcpuLimit
      else:
        self.log.warn('Problem parsing "%s" for CPU limit' % line)
    if re.search('.*Resource_List.walltime.*', line):
      if len(info) >= 3:
        wcList = info[2].split(':')
        wallClockLimit = (float(wcList[0]) * 60 + float(wcList[1])) * 60 + float(wcList[2])
      else:
        self.log.warn('Problem parsing "%s" for wall clock limit' % line)

  consumed = {'CPU': cpu,
              'CPULimit': cpuLimit,
              'WallClock': wallClock,
              'WallClockLimit': wallClockLimit}
  self.log.debug(consumed)

  if None not in consumed.values():
    self.log.debug("TimeLeft counters complete:", str(consumed))
    return S_OK(consumed)
  else:
    missed = [key for key, val in consumed.items() if val is None]
    self.log.info('Could not determine parameter', ','.join(missed))
    self.log.debug('This is the stdout from the batch system call\n%s' % (result['Value']))

  if cpuLimit or wallClockLimit:
    # We got a partial result from PBS, assume that we ran for too short a time
    if not cpuLimit:
      consumed['CPULimit'] = wallClockLimit * 0.8
    if not wallClockLimit:
      consumed['WallClockLimit'] = cpuLimit / 0.8
    if not cpu:
      consumed['CPU'] = int(time.time() - self.startTime)
    if not wallClock:
      consumed['WallClock'] = int(time.time() - self.startTime)
    self.log.debug("TimeLeft counters restored:", str(consumed))
    return S_OK(consumed)
  else:
    msg = 'Could not determine some parameters'
    self.log.info(msg, ':\nThis is the stdout from the batch system call\n%s' % (result['Value']))
    retVal = S_ERROR(msg)
    retVal['Value'] = consumed
    return retVal
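# Illustrative example of the HH:MM:SS parsing above: a PBS `qstat -f` line such as
#   resources_used.cput = 01:30:15
# splits into info = ['resources_used.cput', '=', '01:30:15'], and the
# conversion (1 * 60 + 30) * 60 + 15 yields 5415 seconds of consumed CPU.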
def reTransfer(self, index, requestObj, subRequestAttrs, subRequestFiles):
  """ action for 'reTransfer' operation

  :param self: self reference
  :param int index: subRequest index in execution order
  :param RequestContainer requestObj: request
  :param dict subRequestAttrs: subRequest's attributes
  :param dict subRequestFiles: subRequest's files
  """
  self.info("reTransfer: processing subrequest %s" % index)
  if requestObj.isSubRequestEmpty(index, "removal")["Value"]:
    self.info("reTransfer: subrequest %s is empty, setting its status to 'Done'" % index)
    requestObj.setSubRequestStatus(index, "removal", "Done")
    return S_OK(requestObj)

  subRequestError = []

  targetSEs = list(set([targetSE.strip() for targetSE in subRequestAttrs["TargetSE"].split(",")
                        if targetSE.strip()]))
  lfnsPfns = [(subFile["LFN"], subFile["PFN"], subFile["Status"]) for subFile in subRequestFiles]

  failed = {}
  for lfn, pfn, status in lfnsPfns:
    self.info("reTransfer: processing file %s" % lfn)
    if status != "Waiting":
      self.info("reTransfer: skipping file %s, status is %s" % (lfn, status))
      continue
    failed.setdefault(lfn, {})
    for targetSE in targetSEs:
      reTransfer = self.replicaManager().onlineRetransfer(targetSE, pfn)
      if reTransfer["OK"]:
        if pfn in reTransfer["Value"]["Successful"]:
          self.info("reTransfer: successfully requested retransfer of %s" % pfn)
        else:
          reason = reTransfer["Value"]["Failed"][pfn]
          self.error("reTransfer: failed to set retransfer request for %s at %s: %s" % (pfn, targetSE, reason))
          failed[lfn][targetSE] = reason
          subRequestError.append("%s:%s:%s" % (lfn, targetSE, reason))
      else:
        self.error("reTransfer: completely failed to retransfer: %s" % reTransfer["Message"])
        failed[lfn][targetSE] = reTransfer["Message"]
        subRequestError.append("%s:%s:%s" % (lfn, targetSE, reTransfer["Message"]))
    if not failed[lfn]:
      self.info("reTransfer: file %s successfully processed at all targetSEs" % lfn)
      requestObj.setSubRequestFileAttributeValue(index, "removal", lfn, "Status", "Done")

  # subrequest empty or all Files done?
  if requestObj.isSubRequestDone(index, "removal")["Value"]:
    self.info("reTransfer: all files processed, setting subrequest status to 'Done'")
    requestObj.setSubRequestStatus(index, "removal", "Done")
  else:
    subRequestError = requestObj.setSubRequestAttributeValue(index, "removal", "Error",
                                                             ";".join(subRequestError)[:255])
  return S_OK(requestObj)
def submitJobs( self ):
  """ Go through defined computing elements and submit jobs if necessary
  """
  queues = self.queueDict.keys()

  # Check that there is some work at all
  setup = CSGlobals.getSetup()
  tqDict = { 'Setup' : setup,
             'CPUTime' : 9999999,
             'SubmitPool' : self.defaultSubmitPools }
  if self.vo:
    tqDict['Community'] = self.vo
  if self.voGroups:
    tqDict['OwnerGroup'] = self.voGroups

  result = Resources.getCompatiblePlatforms( self.platforms )
  if not result['OK']:
    return result
  tqDict['Platform'] = result['Value']
  tqDict['Site'] = self.sites
  tags = []
  for queue in queues:
    tags += self.queueDict[queue]['ParametersDict']['Tags']
  tqDict['Tag'] = list( set( tags ) )

  self.log.verbose( 'Checking overall TQ availability with requirements' )
  self.log.verbose( tqDict )

  rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
  result = rpcMatcher.getMatchingTaskQueues( tqDict )
  if not result[ 'OK' ]:
    return result
  if not result['Value']:
    self.log.verbose( 'No Waiting jobs suitable for the director' )
    return S_OK()

  jobSites = set()
  anySite = False
  testSites = set()
  totalWaitingJobs = 0
  for tqID in result['Value']:
    if "Sites" in result['Value'][tqID]:
      for site in result['Value'][tqID]['Sites']:
        if site.lower() != 'any':
          jobSites.add( site )
        else:
          anySite = True
    else:
      anySite = True
    if "JobTypes" in result['Value'][tqID]:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            testSites.add( site )
    totalWaitingJobs += result['Value'][tqID]['Jobs']

  tqIDList = result['Value'].keys()
  self.log.info( tqIDList )
  result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS },
                                      None )
  tagWaitingPilots = 0
  if result['OK']:
    tagWaitingPilots = result['Value']
  self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' %
                 ( totalWaitingJobs, len( tqIDList ), tagWaitingPilots ) )
  self.log.info( 'Queues: ', self.queueDict.keys() )
  # if tagWaitingPilots >= totalWaitingJobs:
  #   self.log.info( 'No more pilots to be submitted in this cycle' )
  #   return S_OK()

  if self.rssFlag:
    result = self.siteClient.getUsableSites()
    if not result['OK']:
      return S_ERROR( 'Can not get the site status' )
    siteMaskList = result['Value']
  else:
    # Use the old way, check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
      return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

  random.shuffle( queues )
  totalSubmittedPilots = 0
  matchedQueues = 0
  for queue in queues:

    # Check if the queue failed previously
    failedCount = self.failedQueues[ queue ] % self.failedQueueCycleFactor
    if failedCount != 0:
      self.log.warn( "%s queue failed recently, skipping %d cycles" % ( queue, 10 - failedCount ) )
      self.failedQueues[queue] += 1
      continue

    ce = self.queueDict[queue]['CE']
    ceName = self.queueDict[queue]['CEName']
    ceType = self.queueDict[queue]['CEType']
    queueName = self.queueDict[queue]['QueueName']
    siteName = self.queueDict[queue]['Site']
    platform = self.queueDict[queue]['Platform']
    queueTags = self.queueDict[queue]['ParametersDict']['Tags']
    siteMask = siteName in siteMaskList
    processorTags = []

    for tag in queueTags:
      if re.match( r'^[0-9]+Processors$', tag ):
        processorTags.append( tag )
    if 'WholeNode' in queueTags:
      processorTags.append( 'WholeNode' )

    if not anySite and siteName not in jobSites:
      self.log.verbose( "Skipping queue %s at %s: no workload expected" % ( queueName, siteName ) )
      continue
    if not siteMask and siteName not in testSites:
      self.log.verbose( "Skipping queue %s at site %s not in the mask" % ( queueName, siteName ) )
      continue

    if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
      queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
    else:
      self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
      continue
    if queueCPUTime > self.maxQueueLength:
      queueCPUTime = self.maxQueueLength

    # Prepare the queue description to look for eligible jobs
    ceDict = ce.getParameterDict()
    ceDict[ 'GridCE' ] = ceName
    # if not siteMask and 'Site' in ceDict:
    #   self.log.info( 'Site not in the mask %s' % siteName )
    #   self.log.info( 'Removing "Site" from matching Dict' )
    #   del ceDict[ 'Site' ]
    if not siteMask:
      ceDict['JobType'] = "Test"
    if self.vo:
      ceDict['Community'] = self.vo
    if self.voGroups:
      ceDict['OwnerGroup'] = self.voGroups

    # This is a hack to get rid of !
    ceDict['SubmitPool'] = self.defaultSubmitPools

    result = Resources.getCompatiblePlatforms( platform )
    if not result['OK']:
      continue
    ceDict['Platform'] = result['Value']
    ceDict['Tag'] = processorTags

    # Get the number of eligible jobs for the target site/queue
    result = rpcMatcher.getMatchingTaskQueues( ceDict )
    if not result['OK']:
      self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
      return result
    taskQueueDict = result['Value']
    if not taskQueueDict:
      self.log.verbose( 'No matching TQs found for %s' % queue )
      continue

    matchedQueues += 1
    totalTQJobs = 0
    totalTQJobsByProcessors = {}
    tqIDList = taskQueueDict.keys()
    tqIDListByProcessors = {}
    for tq in taskQueueDict:
      if 'Tags' not in taskQueueDict[tq]:
        # skip non multiprocessor tqs
        continue
      for tag in taskQueueDict[tq]['Tags']:
        if tag in processorTags:
          tqIDListByProcessors.setdefault( tag, [] )
          tqIDListByProcessors[tag].append( tq )
          totalTQJobsByProcessors.setdefault( tag, 0 )
          totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']
      totalTQJobs += taskQueueDict[tq]['Jobs']

    self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' %
                      ( totalTQJobs, len( tqIDList ), queue ) )

    queueSubmittedPilots = 0
    for tag in tqIDListByProcessors:
      self.log.verbose( "Try to submit pilots for Tag=%s (TQs=%s)" % ( tag, tqIDListByProcessors[tag] ) )

      processors = 1
      m = re.match( r'^(?P<processors>[0-9]+)Processors$', tag )
      if m:
        processors = int( m.group( 'processors' ) )
      if tag == 'WholeNode':
        processors = -1

      tagTQJobs = totalTQJobsByProcessors[tag]
      tagTqIDList = tqIDListByProcessors[tag]

      # Get the number of already waiting pilots for these task queues
      tagWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tagTqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                            None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          tagWaitingPilots = 0
        else:
          tagWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots )
      if tagWaitingPilots >= tagTQJobs:
        self.log.verbose( "%d waiting pilots already for all the available jobs" % tagWaitingPilots )
        continue

      self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" %
                        ( tagWaitingPilots, tagTQJobs, queue ) )

      # Get the working proxy
      cpuTime = queueCPUTime + 86400
      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      totalSlots = self.getQueueSlots( queue, False )
      if totalSlots == 0:
        self.log.debug( '%s: No slots available' % queue )
        continue

      # Note: comparing slots to job numbers is not accurate in the multiprocessor case.
      #       This could lead to over submission.
      pilotsToSubmit = max( 0, min( totalSlots, tagTQJobs - tagWaitingPilots ) )
      self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                     ( queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        jobExecDir = self.queueDict[queue]['ParametersDict'].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue]['ParametersDict'].get( 'HttpProxy', '' )

        result = self.getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk, processors = processors )
        # FIXME: The condor thing only transfers the file with some
        #        delay, so when we unlink here the script is gone
        # FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
        if ceType != 'HTCondorCE':
          os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          self.failedQueues[queue] += 1
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        queueSubmittedPilots += pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB: assign pilots to TaskQueues proportionally
        # to the task queue priorities
        pilotList = result['Value']
        self.queueSlots[queue]['AvailableSlots'] -= len( pilotList )
        totalSubmittedPilots += len( pilotList )
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        if 'PilotStampDict' in result:
          stampDict = result['PilotStampDict']
        tqPriorityList = []
        sumPriority = 0.
        for tq in tagTqIDList:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        tqDict = {}
        for pilotID in pilotList:
          rndm = random.random() * sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if tqID not in tqDict:
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                      tqID,
                                                      self.pilotDN,
                                                      self.pilotGroup,
                                                      self.localhost,
                                                      ceType,
                                                      '',
                                                      stampDict )
          if not result['OK']:
            self.log.error( 'Failed to add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                   'Successfully submitted by the SiteDirector',
                                                   siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

  self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" %
                 ( totalSubmittedPilots, matchedQueues ) )
  return S_OK()
def pfnparse(pfn):
  """ parse pfn and save all bits of information into dictionary

  :param str pfn: pfn string
  """
  if not pfn:
    return S_ERROR("wrong 'pfn' argument value in function call, expected non-empty string, got %s" % str(pfn))
  pfnDict = dict.fromkeys(["Protocol", "Host", "Port", "WSUrl", "Path", "FileName"], "")
  try:
    if ":" not in pfn:
      # pfn = /a/b/c
      pfnDict["Path"] = os.path.dirname(pfn)
      pfnDict["FileName"] = os.path.basename(pfn)
    else:
      # pfn = protocol:/a/b/c
      # pfn = protocol://host/a/b/c
      # pfn = protocol://host:port/a/b/c
      # pfn = protocol://host:port/wsurl?=/a/b/c
      pfnDict["Protocol"] = pfn[0:pfn.index(":")]
      ## remove protocol:
      pfn = pfn[len(pfnDict["Protocol"]):]
      ## remove :// or :
      pfn = pfn[3:] if pfn.startswith("://") else pfn[1:]
      if pfn.startswith("/"):
        ## /a/b/c
        pfnDict["Path"] = os.path.dirname(pfn)
        pfnDict["FileName"] = os.path.basename(pfn)
      else:
        ## host/a/b/c
        ## host:port/a/b/c
        ## host:port/wsurl?=/a/b/c
        if ":" not in pfn:
          ## host/a/b/c
          pfnDict["Host"] = pfn[0:pfn.index("/")]
          pfn = pfn[len(pfnDict["Host"]):]
          pfnDict["Path"] = os.path.dirname(pfn)
          pfnDict["FileName"] = os.path.basename(pfn)
        else:
          ## host:port/a/b/c
          ## host:port/wsurl?=/a/b/c
          pfnDict["Host"] = pfn[0:pfn.index(":")]
          ## port/a/b/c
          ## port/wsurl?=/a/b/c
          pfn = pfn[len(pfnDict["Host"]) + 1:]
          pfnDict["Port"] = pfn[0:pfn.index("/")]
          ## /a/b/c
          ## /wsurl?=/a/b/c
          pfn = pfn[len(pfnDict["Port"]):]
          WSUrl = pfn.find("?")
          WSUrlEnd = pfn.find("=")
          if WSUrl == -1 and WSUrlEnd == -1:
            ## /a/b/c
            pfnDict["Path"] = os.path.dirname(pfn)
            pfnDict["FileName"] = os.path.basename(pfn)
          else:
            ## /wsurl?blah=/a/b/c
            pfnDict["WSUrl"] = pfn[0:WSUrlEnd + 1]
            ## /a/b/c
            pfn = pfn[len(pfnDict["WSUrl"]):]
            pfnDict["Path"] = os.path.dirname(pfn)
            pfnDict["FileName"] = os.path.basename(pfn)
    return S_OK(pfnDict)
  except Exception:
    errStr = "Pfn.pfnparse: Exception while parsing pfn: " + str(pfn)
    gLogger.exception(errStr)
    return S_ERROR(errStr)
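# Usage sketch (illustrative, with a made-up SRM-style URL):
#
#   result = pfnparse('srm://host.example.org:8443/srm/managerv2?SFN=/a/b/c')
#   # result['Value'] == {'Protocol': 'srm', 'Host': 'host.example.org',
#   #                     'Port': '8443', 'WSUrl': '/srm/managerv2?SFN=',
#   #                     'Path': '/a/b', 'FileName': 'c'}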
class RequestAgentBase(object):
  """
  .. class:: RequestAgentBase

  Helper class for DMS agents dealing with RequestContainers and Requests.
  """
  ## reference to ReplicaManager
  __replicaManager = None
  ## reference to DataLoggingClient
  __dataLoggingClient = None
  ## reference to RequestClient
  __requestClient = None
  ## reference to RequestDbMySQL
  __requestDBMySQL = None
  ## reference to TransferDB itself
  __transferDB = None
  ## reference to StorageFactory
  __storageFactory = None

  ##############################################
  # component getters

  @classmethod
  def replicaManager(cls):
    """ ReplicaManager getter

    :param cls: class reference
    """
    if not cls.__replicaManager:
      cls.__replicaManager = ReplicaManager()
    return cls.__replicaManager

  @classmethod
  def dataLoggingClient(cls):
    """ DataLoggingClient getter

    :param cls: class reference
    """
    if not cls.__dataLoggingClient:
      cls.__dataLoggingClient = DataLoggingClient()
    return cls.__dataLoggingClient

  @classmethod
  def requestClient(cls):
    """ RequestClient getter

    :param cls: class reference
    """
    if not cls.__requestClient:
      cls.__requestClient = RequestClient()
    return cls.__requestClient

  @classmethod
  def requestDBMySQL(cls):
    """ RequestDBMySQL getter

    :param cls: class reference
    """
    if not cls.__requestDBMySQL:
      cls.__requestDBMySQL = RequestDBMySQL()
    return cls.__requestDBMySQL

  @classmethod
  def transferDB(cls):
    """ TransferDB getter

    :param cls: class reference
    """
    if not cls.__transferDB:
      cls.__transferDB = TransferDB()
    return cls.__transferDB

  @classmethod
  def storageFactory(cls):
    """ StorageFactory getter

    :param cls: class reference
    """
    if not cls.__storageFactory:
      cls.__storageFactory = StorageFactory()
    return cls.__storageFactory

  @classmethod
  def getRequestDict(cls, requestType):
    """ retrieve a Request of type requestType from the RequestDB

    :param cls: class reference
    :param str requestType: type of request

    :return: S_ERROR on error
    :return: S_OK with request dictionary::

      requestDict = { "requestString" : str,
                      "requestName" : str,
                      "sourceServer" : str,
                      "executionOrder" : list,
                      "requestObj" : RequestContainer,
                      "jobId" : int }
    """
    ## prepare requestDict
    requestDict = { "requestString" : None,
                    "requestName" : None,
                    "sourceServer" : None,
                    "executionOrder" : None,
                    "requestObj" : None,
                    "jobId" : None }
    ## get request out of DB
    res = cls.requestClient().getRequest(requestType)
    if not res["OK"]:
      gLogger.error(res["Message"])
      return res
    elif not res["Value"]:
      msg = "Request of type '%s' not found in RequestDB." % requestType
      gLogger.info(msg)
      return S_OK()
    ## store values
    requestDict["requestName"] = res["Value"]["RequestName"]
    requestDict["requestString"] = res["Value"]["RequestString"]
    requestDict["sourceServer"] = res["Value"]["Server"]
    requestDict["requestObj"] = RequestContainer(request=requestDict["requestString"])
    ## get JobID (catch TypeError as well, in case JobID is None)
    try:
      requestDict["jobId"] = int(res["JobID"])
    except (ValueError, TypeError), exc:
      gLogger.warn("Cannot read JobID for request %s, setting it to 0: %s" %
                   (requestDict["requestName"], str(exc)))
      requestDict["jobId"] = 0
    ## get the execution order
    res = cls.requestClient().getCurrentExecutionOrder(requestDict["requestName"],
                                                       requestDict["sourceServer"])
    if not res["OK"]:
      msg = "Can not get the execution order for request %s." % requestDict["requestName"]
      gLogger.error(msg, res["Message"])
      return res
    requestDict["executionOrder"] = res["Value"]
    ## return requestDict
    return S_OK(requestDict)
def getName(self): """ Get the catalog type name """ return S_OK(self.name)
def _Broadcast(self): """ This plug-in takes files found at the sourceSE and broadcasts them to all (or a selection of) targetSEs. """ if not self.params: return S_ERROR( "TransformationPlugin._Broadcast: The 'Broadcast' plugin requires additional parameters." ) targetseParam = self.params['TargetSE'] sourceSEs = eval(self.params['SourceSE']) if isinstance(targetseParam, list): targetSEs = targetseParam elif targetseParam.count('['): targetSEs = eval(targetseParam) else: targetSEs = [targetseParam] destinations = int(self.params.get('Destinations', 0)) if destinations and (destinations >= len(targetSEs)): destinations = 0 status = self.params['Status'] groupSize = self.params['GroupSize'] # Number of files per task fileGroups = getFileGroups(self.data) # groups by SE targetSELfns = {} for replicaSE, lfns in fileGroups.items(): ses = replicaSE.split(',') # only consider files that have a replica at one of the source SEs atSource = False for se in ses: if se in sourceSEs: atSource = True if not atSource: continue for lfn in lfns: targets = [] sources = self._getSitesForSEs(ses) random.shuffle(targetSEs) for targetSE in targetSEs: site = self._getSiteForSE(targetSE)['Value'] if site not in sources: if destinations and (len(targets) >= destinations): continue sources.append(site) targets.append( targetSE ) # after all, if someone wants to copy to the source, it's their choice strTargetSEs = ','.join(sorted(targets)) targetSELfns.setdefault(strTargetSEs, []).append(lfn) tasks = [] for ses, lfns in targetSELfns.items(): tasksLfns = breakListIntoChunks(lfns, groupSize) for taskLfns in tasksLfns: if (status == 'Flush') or (len(taskLfns) >= int(groupSize)): # do not allow groups smaller than groupSize, except if the transformation is in the Flush state tasks.append((ses, taskLfns)) return S_OK(tasks)
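A sketch of the parameter shapes the plugin expects; all SE names and values below are invented for illustration:

# Hypothetical parameter set: broadcast from one source SE to at most
# two of three target SEs, grouping 100 files per task.
params = {
    'SourceSE': "['SOURCE-SE']",   # string form; the plugin eval()s it
    'TargetSE': "['TARGET-A', 'TARGET-B', 'TARGET-C']",
    'Destinations': 2,
    'Status': 'Active',
    'GroupSize': 100,
}
# With these set as self.params, _Broadcast() returns S_OK([(targetSEsString, lfnChunk), ...])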
def getStorages(self, storageName, pluginList=None, hideExceptions=False): """ Get an instance of a Storage based on the DIRAC SE name, using the CS entries :param storageName: the DIRAC SE name, e.g. 'CERN-RAW' :param pluginList: optional list of protocols if a sub-set is desired, e.g. ['SRM2','SRM1'] :return: dictionary containing storage elements and information about them """ self.remotePlugins = [] self.localPlugins = [] self.name = '' self.options = {} self.protocols = {} self.storages = [] if pluginList is None: pluginList = [] elif isinstance(pluginList, basestring): pluginList = [pluginList] if not self.vo: gLogger.warn('No VO information available') # Get the name of the storage provided res = self._getConfigStorageName(storageName, 'Alias') if not res['OK']: return res storageName = res['Value'] self.name = storageName # In case the storage is made from a base SE, get this information res = self._getConfigStorageName(storageName, 'BaseSE') if not res['OK']: return res # If the storage is derived from another one, keep the information # We initialize seConfigPath to SE_BASE_CONFIG_PATH if there is a derivedSE, SE_CONFIG_PATH otherwise if res['Value'] != storageName: derivedStorageName = storageName storageName = res['Value'] seConfigPath = SE_BASE_CONFIG_PATH else: derivedStorageName = None seConfigPath = SE_CONFIG_PATH # Get the options defined in the CS for this storage res = self._getConfigStorageOptions(storageName, derivedStorageName=derivedStorageName, seConfigPath=seConfigPath) if not res['OK']: # This is for backward compatibility and to invite developers to move their BaseSE to the correct section gLogger.warn("Deprecated configuration, you can ignore the error message above." " Please move the baseSE in the correct section: ", SE_BASE_CONFIG_PATH) # We change the value of seConfigPath to avoid other errors due to the bad SE_BASE_CONFIG_PATH seConfigPath = SE_CONFIG_PATH res = self._getConfigStorageOptions(storageName, derivedStorageName=derivedStorageName, seConfigPath=seConfigPath) if not res['OK']: return res self.options = res['Value'] # Get the protocol specific details res = self._getConfigStorageProtocols(storageName, derivedStorageName=derivedStorageName, seConfigPath=seConfigPath) if not res['OK']: return res self.protocols = res['Value'] requestedLocalPlugins = [] requestedRemotePlugins = [] requestedProtocolDetails = [] turlProtocols = [] # Generate the protocol specific plug-ins for protocolSection, protocolDetails in self.protocols.iteritems(): pluginName = protocolDetails.get('PluginName', protocolSection) if pluginList and pluginName not in pluginList: continue protocol = protocolDetails['Protocol'] result = self.__generateStorageObject(storageName, pluginName, protocolDetails, hideExceptions=hideExceptions) if result['OK']: self.storages.append(result['Value']) if pluginName in self.localPlugins: turlProtocols.append(protocol) requestedLocalPlugins.append(pluginName) if pluginName in self.remotePlugins: requestedRemotePlugins.append(pluginName) requestedProtocolDetails.append(protocolDetails) else: gLogger.info(result['Message']) if self.storages: resDict = {} resDict['StorageName'] = self.name resDict['StorageOptions'] = self.options resDict['StorageObjects'] = self.storages resDict['LocalPlugins'] = requestedLocalPlugins resDict['RemotePlugins'] = requestedRemotePlugins resDict['ProtocolOptions'] = requestedProtocolDetails resDict['TurlProtocols'] = turlProtocols return S_OK(resDict) else: errStr = "StorageFactory.getStorages: Failed to instantiate any storage protocols." gLogger.error(errStr, self.name) return S_ERROR(errStr)
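Typical use, sketched with a hypothetical SE name and plugin list; the constructor arguments shown are assumptions about how the factory is configured, not taken from the code above:

# Hypothetical SE and plugin names, for illustration only
factory = StorageFactory(vo='myvo')
res = factory.getStorages('EXAMPLE-SE', pluginList=['GFAL2_SRM2'])
if res['OK']:
    storages = res['Value']['StorageObjects']      # instantiated plugin objects
    turlProtocols = res['Value']['TurlProtocols']  # protocols usable for TURLs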
def __call__(self): """ remove replicas """ # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") else: # # gMonitor stuff gMonitor.registerActivity("RemoveReplicaAtt", "Replica removals attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RemoveReplicaOK", "Successful replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RemoveReplicaFail", "Failed replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) # # prepare list of targetSEs targetSEs = self.operation.targetSEList # # check targetSEs for removal bannedTargets = self.checkSEsRSS(targetSEs, access='RemoveAccess') if not bannedTargets['OK']: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation)) ) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("RemoveReplicaAtt") gMonitor.addMark("RemoveReplicaFail") return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for removal" % ",".join(bannedTargets['Value'])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # and prepare dict toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles) self.log.info("Todo: %s replicas to delete from %s SEs" % (len(toRemoveDict), len(targetSEs))) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(toRemoveDict)) ) else: gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs)) # # keep status for each targetSE removalStatus = dict.fromkeys(toRemoveDict, None) for lfn in removalStatus: removalStatus[lfn] = dict.fromkeys(targetSEs, None) # # loop over targetSEs for targetSE in targetSEs: self.log.info("Removing replicas at %s" % targetSE) # # 1st step - bulk removal bulkRemoval = self._bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error('Bulk replica removal failed', bulkRemoval["Message"]) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return bulkRemoval # # report removal status for successful files if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", len([opFile for opFile in toRemoveDict.values() if not opFile.Error])) ) else: gMonitor.addMark("RemoveReplicaOK", len([opFile for opFile in toRemoveDict.values() if not opFile.Error])) # # 2nd step - process the rest again toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Error) for lfn, opFile in toRetry.items(): self._removeWithOwnerProxy(opFile, targetSE) if opFile.Error: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1) ) else: gMonitor.addMark("RemoveReplicaFail", 1) removalStatus[lfn][targetSE] = opFile.Error else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1) ) else: gMonitor.addMark("RemoveReplicaOK", 1) # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = list(set(error for error in removalStatus[opFile.LFN].values() if error)) if errors: opFile.Error = "\n".join(errors) # This seems to be the only unrecoverable error if "Write access not permitted for this credential" in opFile.Error: failed += 1 opFile.Status = "Failed" else: opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s replicas" % failed if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK()
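The bookkeeping above is per LFN and per SE; a sketch of the structure after the loop, with all names invented:

# Illustrative shape of removalStatus (LFNs and SE names invented):
# removalStatus = {
#   '/vo/user/file1': {'SE-A': None, 'SE-B': None},  # removed everywhere -> Status 'Done'
#   '/vo/user/file2': {'SE-A': None,
#                      'SE-B': 'Write access not permitted for this credential'},  # -> 'Failed'
# }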
def renewFromMyProxy( self, userDN, userGroup, lifeTime = False, chain = False ): if not lifeTime: lifeTime = 43200 if not self.__useMyProxy: return S_ERROR( "myproxy is disabled" ) # Get the chain if not chain: retVal = self.__getPemAndTimeLeft( userDN, userGroup ) if not retVal[ 'OK' ]: return retVal pemData = retVal[ 'Value' ][0] chain = X509Chain() retVal = chain.loadProxyFromString( pemData ) if not retVal[ 'OK' ]: return retVal originChainLifeTime = chain.getRemainingSecs()[ 'Value' ] maxMyProxyLifeTime = self.getMyProxyMaxLifeTime() # If the chain already has more than 80% of the max MyProxy lifetime, don't ask MyProxy if originChainLifeTime > maxMyProxyLifeTime * 0.8: self.log.info( "Skipping myproxy download", "user %s %s chain has %s secs and requested %s secs" % ( userDN, userGroup, originChainLifeTime, maxMyProxyLifeTime ) ) return S_OK( chain ) # Request some margin, but never more than the max MyProxy lifetime lifeTime *= 1.3 if lifeTime > maxMyProxyLifeTime: lifeTime = maxMyProxyLifeTime self.log.info( "Renewing proxy from myproxy", "user %s %s for %s secs" % ( userDN, userGroup, lifeTime ) ) myProxy = MyProxy( server = self.getMyProxyServer() ) retVal = myProxy.getDelegatedProxy( chain, lifeTime ) if not retVal[ 'OK' ]: return retVal mpChain = retVal[ 'Value' ] retVal = mpChain.getRemainingSecs() if not retVal[ 'OK' ]: return S_ERROR( "Can't retrieve remaining secs from renewed proxy: %s" % retVal[ 'Message' ] ) mpChainSecsLeft = retVal['Value'] if mpChainSecsLeft < originChainLifeTime: self.log.info( "Chain downloaded from myproxy has less lifetime than the one stored in the db", "\n Downloaded from myproxy: %s secs\n Stored in DB: %s secs" % ( mpChainSecsLeft, originChainLifeTime ) ) return S_OK( chain ) retVal = mpChain.getDIRACGroup() if not retVal[ 'OK' ]: return S_ERROR( "Can't retrieve DIRAC Group from renewed proxy: %s" % retVal[ 'Message' ] ) chainGroup = retVal['Value'] if chainGroup != userGroup: return S_ERROR( "Mismatch between renewed proxy group and expected: %s vs %s" % ( chainGroup, userGroup ) ) retVal = self.storeProxy( userDN, userGroup, mpChain ) if not retVal[ 'OK' ]: self.log.error( "Cannot store proxy after renewal", retVal[ 'Message' ] ) retVal = myProxy.getServiceDN() if not retVal[ 'OK' ]: hostDN = userDN else: hostDN = retVal[ 'Value' ] self.logAction( "myproxy renewal", hostDN, "host", userDN, userGroup ) return S_OK( mpChain )
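Worked numbers for the thresholds above; all values are arbitrary illustrations of the arithmetic in the code, not site defaults:

# Illustration: with a MyProxy max lifetime of 7 days,
maxMyProxyLifeTime = 7 * 24 * 3600        # 604800 s
skipThreshold = maxMyProxyLifeTime * 0.8  # 483840 s: chains above this are not renewed
requested = 43200 * 1.3                   # default 12 h request, padded to 56160 s
requested = min(requested, maxMyProxyLifeTime)  # never ask for more than the max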
def getPilotMonitorWeb(self, selectDict, sortList, startItem, maxItems): """ Get summary of the pilot job information in a standard structure """ resultDict = {} if 'Owner' in selectDict: userList = selectDict['Owner'] if not isinstance(userList, list): userList = [userList] dnList = [] for uName in userList: dnRes = getDNForUsername(uName) if dnRes['OK']: dnList += dnRes['Value'] selectDict['OwnerDN'] = dnList del selectDict['Owner'] startDate = selectDict.get('FromDate', None) if startDate: del selectDict['FromDate'] # For backward compatibility: older clients pass the start date as 'LastUpdateTime' if startDate is None: startDate = selectDict.get('LastUpdateTime', None) if startDate: del selectDict['LastUpdateTime'] endDate = selectDict.get('ToDate', None) if endDate: del selectDict['ToDate'] # Sorting instructions. Only one for the moment. if sortList: orderAttribute = sortList[0][0] + ":" + sortList[0][1] else: orderAttribute = None # Select pilots for the summary result = self.selectPilots(selectDict, orderAttribute=orderAttribute, newer=startDate, older=endDate, timeStamp='LastUpdateTime') if not result['OK']: return S_ERROR('Failed to select pilots: ' + result['Message']) pList = result['Value'] nPilots = len(pList) resultDict['TotalRecords'] = nPilots if nPilots == 0: return S_OK(resultDict) ini = startItem last = ini + maxItems if ini >= nPilots: return S_ERROR('Item number out of range') if last > nPilots: last = nPilots pilotList = pList[ini:last] paramNames = [ 'PilotJobReference', 'OwnerDN', 'OwnerGroup', 'GridType', 'Broker', 'Status', 'DestinationSite', 'BenchMark', 'ParentID', 'SubmissionTime', 'PilotID', 'LastUpdateTime', 'CurrentJobID', 'TaskQueueID', 'GridSite' ] result = self.getPilotInfo(pilotList, paramNames=paramNames) if not result['OK']: return S_ERROR('Failed to get pilot info: ' + result['Message']) pilotDict = result['Value'] records = [] for pilot in pilotList: parList = [] for parameter in paramNames: if type(pilotDict[pilot][parameter]) not in [ IntType, LongType ]: parList.append(str(pilotDict[pilot][parameter])) else: parList.append(pilotDict[pilot][parameter]) if parameter == 'GridSite': gridSite = pilotDict[pilot][parameter] # If the grid site is unknown, try to recover it at the last moment from the CE name if gridSite == "Unknown": ce = pilotDict[pilot]['DestinationSite'] result = getSiteForCE(ce) if result['OK']: gridSite = result['Value'] del parList[-1] parList.append(gridSite) records.append(parList) resultDict['ParameterNames'] = paramNames resultDict['Records'] = records return S_OK(resultDict)
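The paging arithmetic above, sketched with made-up numbers:

# Illustration of the startItem/maxItems window: with 250 selected pilots,
nPilots, startItem, maxItems = 250, 200, 100
ini = startItem                        # 200
last = min(ini + maxItems, nPilots)    # 300 clipped to 250, the list length
# pilotList = pList[200:250] -> the last 50 records are returned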
def getPilotInfo(self, pilotRef=False, parentId=False, conn=False, paramNames=None, pilotID=False): """ Get all the information for the pilot job reference or reference list """ parameters = [ 'PilotJobReference', 'OwnerDN', 'OwnerGroup', 'GridType', 'Broker', 'Status', 'DestinationSite', 'BenchMark', 'ParentID', 'OutputReady', 'AccountingSent', 'SubmissionTime', 'PilotID', 'LastUpdateTime', 'TaskQueueID', 'GridSite', 'PilotStamp', 'Queue' ] if paramNames: parameters = paramNames cmd = "SELECT %s FROM PilotAgents" % ", ".join(parameters) condSQL = [] if pilotRef: if type(pilotRef) == ListType: condSQL.append("PilotJobReference IN (%s)" % ",".join(['"%s"' % x for x in pilotRef])) else: condSQL.append("PilotJobReference = '%s'" % pilotRef) if pilotID: if type(pilotID) == ListType: condSQL.append("PilotID IN (%s)" % ",".join(['%s' % x for x in pilotID])) else: condSQL.append("PilotID = %s" % pilotID) if parentId: if type(parentId) == ListType: condSQL.append("ParentID IN (%s)" % ",".join(['%s' % x for x in parentId])) else: condSQL.append("ParentID = %s" % parentId) if condSQL: cmd = "%s WHERE %s" % (cmd, " AND ".join(condSQL)) result = self._query(cmd, conn=conn) if not result['OK']: return result if not result['Value']: msg = "No pilots found" if pilotRef: msg += " for PilotJobReference(s): %s" % pilotRef if parentId: msg += " with parent id: %s" % parentId return S_ERROR(msg) resDict = {} pilotIDs = [] for row in result['Value']: pilotDict = {} for i in range(len(parameters)): pilotDict[parameters[i]] = row[i] if parameters[i] == 'PilotID': pilotIDs.append(row[i]) resDict[row[0]] = pilotDict result = self.getJobsForPilot(pilotIDs) if not result['OK']: return S_OK(resDict) jobsDict = result['Value'] for pilotRef in resDict: pilotInfo = resDict[pilotRef] pilotID = pilotInfo['PilotID'] if pilotID in jobsDict: pilotInfo['Jobs'] = jobsDict[pilotID] return S_OK(resDict)
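A sketch of the SQL the method builds for a list of references; the reference values are invented and the column list is elided for brevity:

# For pilotRef=['ref1', 'ref2'] and the default parameter list,
# the generated query has this shape (columns elided):
#   SELECT PilotJobReference, OwnerDN, ..., Queue FROM PilotAgents
#     WHERE PilotJobReference IN ("ref1","ref2")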
def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems): """ Get summary of the pilot jobs status by CE/site in a standard structure """ stateNames = [ 'Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running', 'Done', 'Aborted' ] allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour'] paramNames = ['Site', 'CE'] + allStateNames last_update = None if 'LastUpdateTime' in selectDict: last_update = selectDict['LastUpdateTime'] del selectDict['LastUpdateTime'] site_select = [] if 'GridSite' in selectDict: site_select = selectDict['GridSite'] if not isinstance(site_select, list): site_select = [site_select] del selectDict['GridSite'] status_select = [] if 'Status' in selectDict: status_select = selectDict['Status'] if not isinstance(status_select, list): status_select = [status_select] del selectDict['Status'] expand_site = '' if 'ExpandSite' in selectDict: expand_site = selectDict['ExpandSite'] site_select = [expand_site] del selectDict['ExpandSite'] # Get all the data from the database with various selections result = self.getCounters('PilotAgents', ['GridSite', 'DestinationSite', 'Status'], selectDict, newer=last_update, timeStamp='LastUpdateTime') if not result['OK']: return result last_update = Time.dateTime() - Time.hour selectDict['Status'] = 'Aborted' resultHour = self.getCounters( 'PilotAgents', ['GridSite', 'DestinationSite', 'Status'], selectDict, newer=last_update, timeStamp='LastUpdateTime') if not resultHour['OK']: return resultHour last_update = Time.dateTime() - Time.day selectDict['Status'] = ['Aborted', 'Done'] resultDay = self.getCounters('PilotAgents', ['GridSite', 'DestinationSite', 'Status'], selectDict, newer=last_update, timeStamp='LastUpdateTime') if not resultDay['OK']: return resultDay selectDict['CurrentJobID'] = 0 selectDict['Status'] = 'Done' resultDayEmpty = self.getCounters( 'PilotAgents', ['GridSite', 'DestinationSite', 'Status'], selectDict, newer=last_update, timeStamp='LastUpdateTime') if not resultDayEmpty['OK']: return resultDayEmpty ceMap = {} resMap = getCESiteMapping() if resMap['OK']: ceMap = resMap['Value'] # Sort out the different counters; initialize each site/CE entry with zeroed # state counters so that the per-hour and per-day loops below cannot hit a missing key resultDict = {} resultDict['Unknown'] = {} for attDict, count in result['Value']: site = attDict['GridSite'] ce = attDict['DestinationSite'] state = attDict['Status'] if site == 'Unknown' and ce != "Unknown" and ce != "Multiple" and ce in ceMap: site = ceMap[ce] ceDict = resultDict.setdefault(site, {}).setdefault(ce, dict.fromkeys(allStateNames, 0)) ceDict[state] = count for attDict, count in resultDay['Value']: site = attDict['GridSite'] ce = attDict['DestinationSite'] state = attDict['Status'] if site == 'Unknown' and ce != "Unknown" and ce in ceMap: site = ceMap[ce] ceDict = resultDict.setdefault(site, {}).setdefault(ce, dict.fromkeys(allStateNames, 0)) if state == "Done": ceDict["Done"] = count if state == "Aborted": ceDict["Aborted"] = count for attDict, count in resultDayEmpty['Value']: site = attDict['GridSite'] ce = attDict['DestinationSite'] state = attDict['Status'] if site == 'Unknown' and ce != "Unknown" and ce in ceMap: site = ceMap[ce] ceDict = resultDict.setdefault(site, {}).setdefault(ce, dict.fromkeys(allStateNames, 0)) if state == "Done": ceDict["Done_Empty"] = count for attDict, count in resultHour['Value']: site = attDict['GridSite'] ce = attDict['DestinationSite'] state = attDict['Status'] if site == 'Unknown' and ce != "Unknown" and ce in ceMap: site = ceMap[ce] ceDict = resultDict.setdefault(site, {}).setdefault(ce, dict.fromkeys(allStateNames, 0)) if state == "Aborted": ceDict["Aborted_Hour"] = count records = []
siteSumDict = {} for site in resultDict: sumDict = {} for state in allStateNames: if not sumDict.has_key(state): sumDict[state] = 0 sumDict['Total'] = 0 for ce in resultDict[site]: itemList = [site, ce] total = 0 for state in allStateNames: itemList.append(resultDict[site][ce][state]) sumDict[state] += resultDict[site][ce][state] if state == "Done": done = resultDict[site][ce][state] if state == "Done_Empty": empty = resultDict[site][ce][state] if state == "Aborted": aborted = resultDict[site][ce][state] if state == "Aborted_Hour": aborted_hour = resultDict[site][ce][state] if state != "Aborted_Hour" and state != "Done_Empty": total += resultDict[site][ce][state] sumDict['Total'] += total # Add the total number of pilots seen in the last day itemList.append(total) # Add pilot submission efficiency evaluation if (done - empty) > 0: eff = float(done) / float(done - empty) elif done == 0: eff = 0. elif empty == done: eff = 99. else: eff = 0. itemList.append('%.2f' % eff) # Add pilot job efficiency evaluation if total > 0: eff = float(total - aborted) / float(total) * 100. else: eff = 100. itemList.append('%.2f' % eff) # Evaluate the quality status of the CE if total > 10: if eff < 25.: itemList.append('Bad') elif eff < 60.: itemList.append('Poor') elif eff < 85.: itemList.append('Fair') else: itemList.append('Good') else: itemList.append('Idle') if len(resultDict[site]) == 1 or expand_site: records.append(itemList) if len(resultDict[site]) > 1 and not expand_site: itemList = [site, 'Multiple'] for state in allStateNames + ['Total']: if sumDict.has_key(state): itemList.append(sumDict[state]) else: itemList.append(0) done = sumDict["Done"] empty = sumDict["Done_Empty"] aborted = sumDict["Aborted"] aborted_hour = sumDict["Aborted_Hour"] total = sumDict["Total"] # Add pilot submission efficiency evaluation if (done - empty) > 0: eff = float(done) / float(done - empty) elif done == 0: eff = 0. elif empty == done: eff = 99. else: eff = 0. itemList.append('%.2f' % eff) # Add pilot job efficiency evaluation if total > 0: eff = float(total - aborted) / float(total) * 100. else: eff = 100. 
itemList.append('%.2f' % eff) # Evaluate the quality status of the Site if total > 10: if eff < 25.: itemList.append('Bad') elif eff < 60.: itemList.append('Poor') elif eff < 85.: itemList.append('Fair') else: itemList.append('Good') else: itemList.append('Idle') records.append(itemList) for state in allStateNames + ['Total']: if not siteSumDict.has_key(state): siteSumDict[state] = sumDict[state] else: siteSumDict[state] += sumDict[state] # Perform site selection if site_select: new_records = [] for r in records: if r[0] in site_select: new_records.append(r) records = new_records # Perform status selection if status_select: new_records = [] for r in records: if r[14] in status_select: new_records.append(r) records = new_records # Get the Site Mask data client = RPCClient('WorkloadManagement/WMSAdministrator') result = client.getSiteMask() if result['OK']: siteMask = result['Value'] for r in records: if r[0] in siteMask: r.append('Yes') else: r.append('No') else: for r in records: r.append('Unknown') finalDict = {} finalDict['TotalRecords'] = len(records) finalDict['ParameterNames'] = paramNames + \ ['Total', 'PilotsPerJob', 'PilotJobEff', 'Status', 'InMask'] # Return all the records if maxItems == 0 or the specified number otherwise if maxItems: finalDict['Records'] = records[startItem:startItem + maxItems] else: finalDict['Records'] = records done = siteSumDict["Done"] empty = siteSumDict["Done_Empty"] aborted = siteSumDict["Aborted"] aborted_hour = siteSumDict["Aborted_Hour"] total = siteSumDict["Total"] # Add pilot submission efficiency evaluation if (done - empty) > 0: eff = float(done) / float(done - empty) elif done == 0: eff = 0. elif empty == done: eff = 99. else: eff = 0. siteSumDict['PilotsPerJob'] = '%.2f' % eff # Add pilot job efficiency evaluation if total > 0: eff = float(total - aborted) / float(total) * 100. else: eff = 100. siteSumDict['PilotJobEff'] = '%.2f' % eff # Evaluate the overall quality status if total > 100: if eff < 25.: siteSumDict['Status'] = 'Bad' elif eff < 60.: siteSumDict['Status'] = 'Poor' elif eff < 85.: siteSumDict['Status'] = 'Fair' else: siteSumDict['Status'] = 'Good' else: siteSumDict['Status'] = 'Idle' finalDict['Extras'] = siteSumDict return S_OK(finalDict)
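The two efficiencies computed above, with worked numbers; all counts are invented:

# PilotsPerJob: finished pilots per pilot that actually ran a job.
# E.g. done=90 finished pilots of which empty=30 ran no payload:
#   eff = 90 / (90 - 30) = 1.50
# PilotJobEff: percentage of pilots that did not abort.
# E.g. total=200, aborted=20:
#   eff = (200 - 20) / 200 * 100 = 90.00  -> quality 'Good' (>= 85)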
def web_getStatisticsData( self ): req = self.__request() paletteColor = Palette() RPC = RPCClient( "WorkloadManagement/WMSAdministrator" ) selector = self.request.arguments["statsField"][0] if selector == 'Site': selector = "GridSite" elif selector == "Computing Element": selector = "DestinationSite" elif selector == "Owner Group": selector = "OwnerGroup" elif selector == "Owner": selector = "OwnerDN" result = yield self.threadTask( RPC.getPilotStatistics, selector, req ) if not result['OK']: if 'FromDate' in req: del req['FromDate'] if 'LastUpdate' in req: del req['LastUpdate'] if 'ToDate' in req: del req['ToDate'] result = yield self.threadTask( RPC.getCounters, "PilotAgents", [selector], req ) statistics = {} if result['OK']: for status, count in result['Value']: if "OwnerDN" in status: userName = getUsernameForDN( status['OwnerDN'] ) if userName['OK']: status['OwnerDN'] = userName['Value'] statistics[status[selector]] = count result = S_OK(statistics) if result["OK"]: callback = [] result = dict( result["Value"] ) keylist = sorted( result ) # 'Site' was mapped to 'GridSite' above, so test the mapped value here if selector == "GridSite": tier1 = gConfig.getValue( "/WebApp/PreferredSites", [] ) if len( tier1 ) > 0: tier1.sort() for i in tier1: if i in result: countryCode = i.rsplit( ".", 1 )[1] callback.append( {"key":i, "value":result[i], "code":countryCode, "color": paletteColor.getColor( countryCode ) } ) for key in keylist: if selector == "GridSite" and tier1: if key not in tier1: try: countryCode = key.rsplit( ".", 1 )[1] except IndexError: countryCode = "Unknown" callback.append( {"key":key, "value":result[key], "code":countryCode, "color": paletteColor.getColor( key ) } ) elif selector == "GridSite" and not tier1: try: countryCode = key.rsplit( ".", 1 )[1] except IndexError: countryCode = "Unknown" callback.append( {"key":key, "value":result[key], "code":countryCode, "color": paletteColor.getColor( key ) } ) else: callback.append( {"key":key, "value":result[key], "code":"", "color": paletteColor.getColor( key ) } ) callback = {"success":"true", "result":callback} else: callback = {"success":"false", "error":result["Message"]} self.finish( callback )
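The handler finishes with a JSON-serialisable dict; a sketch of the payload shape, with the site name, count, and colour invented:

# Illustrative payload returned to the web client on success:
# {"success": "true",
#  "result": [{"key": "LCG.Example.ch", "value": 1234,
#              "code": "ch", "color": "#a0b0c0"}, ...]}
# and on failure:
# {"success": "false", "error": "<message from the service>"}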
def getTimeLeft(self, cpuConsumed=0.0, processors=1): """Returns the CPU Time Left for supported batch systems. The CPUConsumed is the current raw total CPU. """ # Quit if no norm factor available if not self.cpuPower: return S_ERROR( "/LocalSite/CPUNormalizationFactor not defined for site %s" % DIRAC.siteName()) if not self.batchPlugin: return S_ERROR(self.batchError) resourceDict = self.batchPlugin.getResourceUsage() if not resourceDict["OK"]: self.log.warn( "Could not determine timeleft for batch system at site %s" % DIRAC.siteName()) return resourceDict resources = resourceDict["Value"] self.log.debug("self.batchPlugin.getResourceUsage(): %s" % str(resources)) if not resources.get("CPULimit") and not resources.get( "WallClockLimit"): # This should never happen return S_ERROR("No CPU or WallClock limit obtained") # if one of CPULimit or WallClockLimit is missing, compute a reasonable value if not resources.get("CPULimit"): resources["CPULimit"] = resources["WallClockLimit"] * processors elif not resources.get("WallClockLimit"): resources["WallClockLimit"] = resources["CPULimit"] / processors # if one of CPU or WallClock is missing, compute a reasonable value if not resources.get("CPU"): resources["CPU"] = resources["WallClock"] * processors elif not resources.get("WallClock"): resources["WallClock"] = resources["CPU"] / processors cpu = float(resources["CPU"]) cpuLimit = float(resources["CPULimit"]) wallClock = float(resources["WallClock"]) wallClockLimit = float(resources["WallClockLimit"]) batchSystemTimeUnit = resources.get("Unit", "Both") # Some batch systems rely on wall clock time and/or cpu time to make allocations if batchSystemTimeUnit == "WallClock": time = wallClock timeLimit = wallClockLimit else: time = cpu timeLimit = cpuLimit if time and cpuConsumed > 3600.0 and self.cpuPower: # If there has been more than 1 hour of consumed CPU and # there is a Normalization set for the current CPU # use that value to renormalize the values returned by the batch system # NOTE: cpuConsumed is non-zero for call by the JobAgent and 0 for call by the watchdog # cpuLimit and cpu may be in the units of the batch system, not real seconds... # (in this case the other case won't work) # therefore renormalise it using cpuConsumed (which is in real seconds) cpuWorkLeft = (timeLimit - time) * self.cpuPower * cpuConsumed / time else: # FIXME: this is always used by the watchdog... Also used by the JobAgent # if consumed less than 1 hour of CPU # It was using self.scaleFactor but this is inconsistent: use the same as above # In case the returned cpu and cpuLimit are not in real seconds, this is however rubbish cpuWorkLeft = (timeLimit - time) * self.cpuPower self.log.verbose("Remaining CPU in normalized units is: %.02f" % cpuWorkLeft) return S_OK(cpuWorkLeft)
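Worked numbers for the renormalisation branch above; all values are invented to illustrate the formula cpuWorkLeft = (timeLimit - time) * cpuPower * cpuConsumed / time:

# Suppose the batch system reports in its own scaled units: time=5000, timeLimit=20000,
# while the pilot really consumed cpuConsumed=7200 s and cpuPower=10 (normalization factor):
time_used, timeLimit, cpuConsumed, cpuPower = 5000.0, 20000.0, 7200.0, 10.0
realSecondsPerUnit = cpuConsumed / time_used               # 7200 / 5000 = 1.44
cpuWorkLeft = (timeLimit - time_used) * cpuPower * realSecondsPerUnit
# (20000 - 5000) * 10 * 1.44 = 216000 normalized units remaining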