def registerToServer(self):
    """ Register type in server """
    rpcClient = RPCClient("Accounting/DataStore")
    return rpcClient.registerType(*self.getDefinition())

def createDirectory(self, path):
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, 'createDirectory', [path], {})

def finalizeRequest(self, requestName, jobID):
    """ Check the request status and perform finalization if necessary:
    update the request status and the corresponding job parameter.

    :param self: self reference
    :param str requestName: request name
    :param int jobID: job id
    """
    stateServer = RPCClient("WorkloadManagement/JobStateUpdate", useCertificates=True)

    # Check whether to update the job status - we should fail here, so it will be re-tried later
    # Check the request state first
    res = self.getRequestStatus(requestName)
    if not res['OK']:
        self.log.error("finalizeRequest: failed to get request",
                       "request: %s status: %s" % (requestName, res["Message"]))
        return res
    if res["Value"] != "Done":
        return S_ERROR("The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
                       % (requestName, res['Value']))

    # The request is 'Done', let's update the job status. If we fail, we should re-try later
    monitorServer = RPCClient("WorkloadManagement/JobMonitoring", useCertificates=True)
    res = monitorServer.getJobPrimarySummary(int(jobID))
    if not res["OK"]:
        self.log.error("finalizeRequest: Failed to get job status", "JobID: %d" % jobID)
        return S_ERROR("finalizeRequest: Failed to get job %d status" % jobID)
    elif not res['Value']:
        self.log.info("finalizeRequest: job %d does not exist (anymore): finalizing" % jobID)
        return S_OK()
    else:
        jobStatus = res["Value"]["Status"]
        jobMinorStatus = res["Value"]["MinorStatus"]

        # Update the job pending request digest in any case, since it is modified
        self.log.info("finalizeRequest: Updating request digest for job %d" % jobID)
        digest = self.getDigest(requestName)
        if digest["OK"]:
            digest = digest["Value"]
            self.log.verbose(digest)
            res = stateServer.setJobParameter(jobID, "PendingRequest", digest)
            if not res["OK"]:
                self.log.info("finalizeRequest: Failed to set job %d parameter: %s" % (jobID, res["Message"]))
                return res
        else:
            self.log.error("finalizeRequest: Failed to get request digest for %s: %s"
                           % (requestName, digest["Message"]))

        stateUpdate = None
        if jobStatus == 'Completed':
            # What to do? Depends on what we have in the minorStatus
            if jobMinorStatus == "Pending Requests":
                self.log.info("finalizeRequest: Updating job status for %d to Done/Requests done" % jobID)
                stateUpdate = stateServer.setJobStatus(jobID, "Done", "Requests done", "")
            elif jobMinorStatus == "Application Finished With Errors":
                self.log.info("finalizeRequest: Updating job status for %d to Failed/Requests done" % jobID)
                stateUpdate = stateServer.setJobStatus(jobID, "Failed", "Requests done", "")

        if not stateUpdate:
            self.log.info("finalizeRequest: Updating job minor status for %d to Requests done (status is %s)"
                          % (jobID, jobStatus))
            stateUpdate = stateServer.setJobStatus(jobID, jobStatus, "Requests done", "")

        if not stateUpdate["OK"]:
            self.log.error("finalizeRequest: Failed to set job status",
                           "JobID: %d status: %s" % (jobID, stateUpdate['Message']))
            return stateUpdate

    return S_OK()

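# Illustrative only: a minimal sketch of how finalizeRequest might be invoked from client
# code once requests have reached the 'Done' state. The client attribute name (reqClient),
# the helper function and the surrounding loop are hypothetical, not part of the original.
def finalizePendingRequests(reqClient, doneRequests):
    """ doneRequests: list of (requestName, jobID) tuples assumed to be 'Done'. """
    for requestName, jobID in doneRequests:
        res = reqClient.finalizeRequest(requestName, jobID)
        if not res['OK']:
            # finalization is simply retried on a later cycle if it fails here
            print("Could not finalize %s for job %s: %s" % (requestName, jobID, res['Message']))
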
def prestageFileStatus(self, path):
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, 'prestageFileStatus', [path], {})

def releaseFile(self, path):
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, 'releaseFile', [path], {})

def execute(self):
    """Main Agent code:
    1.- Query TaskQueueDB for existing TQs
    2.- Add their Priorities
    3.- Submit pilots
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()
    # Add all submit pools
    self.directorDict['SubmitPool'] = self.am_getOption("SubmitPools")
    # Add all DIRAC platforms if not specified otherwise
    if 'Platform' not in self.directorDict:
        result = getDIRACPlatforms()
        if result['OK']:
            self.directorDict['Platform'] = result['Value']

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(self.directorDict)
    if not result['OK']:
        self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
        return result
    taskQueueDict = result['Value']

    self.log.info('Found %s TaskQueues' % len(taskQueueDict))

    if not taskQueueDict:
        self.log.info('No TaskQueue to Process')
        return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID in taskQueueDict:
        taskQueueDict[taskQueueID]['TaskQueueID'] = taskQueueID
        prioritySum += taskQueueDict[taskQueueID]['Priority']
        waitingJobs += taskQueueDict[taskQueueID]['Jobs']

    self.log.info('Sum of Priorities %s' % prioritySum)

    if waitingJobs == 0:
        self.log.info('No waiting Jobs')
        return S_OK('No waiting Jobs')
    if prioritySum <= 0:
        return S_ERROR('Wrong TaskQueue Priorities')

    self.pilotsPerPriority = self.am_getOption('pilotsPerIteration') / prioritySum
    self.pilotsPerJob = self.am_getOption('pilotsPerIteration') / waitingJobs

    self.callBackLock.acquire()
    self.submittedPilots = 0
    self.callBackLock.release()
    self.toSubmitPilots = 0
    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    timeLimitToConsider = Time.toString(Time.dateTime() - Time.hour * self.am_getOption("maxPilotWaitingHours"))

    for taskQueueID in taskQueueDict:
        self.log.verbose('Processing TaskQueue', taskQueueID)

        result = pilotAgentsDB.countPilots({'TaskQueueID': taskQueueID,
                                            'Status': waitingStatusList},
                                           None, timeLimitToConsider)
        if not result['OK']:
            self.log.error('Failed to get Number of Waiting pilots', result['Message'])
            waitingPilots = 0
        else:
            waitingPilots = result['Value']
            self.log.verbose('Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots)

        result = self.submitPilotsForTaskQueue(taskQueueDict[taskQueueID], waitingPilots)
        if result['OK']:
            self.toSubmitPilots += result['Value']

    self.log.info('Number of pilots to be Submitted %s' % self.toSubmitPilots)

    # Now wait until all Jobs in the Default ThreadPool are processed
    if 'Default' in self.pools:
        # only for those in the 'Default' thread Pool
        # for pool in self.pools:
        self.pools['Default'].processAllResults()

    self.log.info('Number of pilots Submitted %s' % self.submittedPilots)

    return S_OK()

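# Illustrative only: the per-priority and per-job pilot rates computed above, worked
# through with a hypothetical getMatchingTaskQueues result shaped as the loop expects
# ({tqID: {'Priority': ..., 'Jobs': ...}}) and an assumed pilotsPerIteration option of 40.
taskQueueDict = {101: {'Priority': 1.0, 'Jobs': 5},
                 102: {'Priority': 3.0, 'Jobs': 15}}
prioritySum = sum(tq['Priority'] for tq in taskQueueDict.values())  # 4.0
waitingJobs = sum(tq['Jobs'] for tq in taskQueueDict.values())      # 20
pilotsPerIteration = 40  # assumed agent option value
pilotsPerPriority = pilotsPerIteration / prioritySum       # 10.0 pilots per unit of priority
pilotsPerJob = float(pilotsPerIteration) / waitingJobs     # 2.0 pilots per waiting job
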
def exists(self, path):
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, 'exists', [path], {})

def export_sendHeartBeat(self, jobID, dynamicData, staticData):
    """ Send a heart beat sign of life for a job jobID
    """
    jobReport = RPCClient('WorkloadManagement/JobStateUpdate', timeout=120)
    result = jobReport.sendHeartBeat(jobID, dynamicData, staticData)
    return result

def export_setJobForPilot(self, jobID, pilotRef, destination=None):
    """ Report the DIRAC job ID which is executed by the given pilot job
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.setJobForPilot(jobID, pilotRef, destination)
    return result

def getComponentsStatus(self, condDict):
    rpcClient = RPCClient("Framework/Monitoring", timeout=100)
    return rpcClient.getComponentsStatus(condDict)

def export_setJobSite(self, jobID, site):
    """ Allows the site attribute to be set for a job specified by its jobID.
    """
    jobReport = RPCClient('WorkloadManagement/JobStateUpdate')
    jobSite = jobReport.setJobSite(jobID, site)
    return jobSite

def __getRPCClient(self):
    if self.rpcClient:
        return self.rpcClient
    return RPCClient(self.serviceName)

os.environ['X509_USER_PROXY'] = proxy
print("*INFO* using proxy %s" % proxy)
print("*********************************************************************************************************")
print("Execution at : '" + time.strftime('%d/%m/%y %H:%M', time.localtime()) + "'")

try:
    d = Dirac()
except AttributeError:
    sys.stderr.write(time.strftime('%d/%m/%y %H:%M', time.localtime()) + " => Error loading Dirac monitor\n")
    raise Exception("Error loading Dirac monitor")

w = RPCClient("WorkloadManagement/JobMonitoring")
delTime = str(Time.dateTime() - delay_job_handled * Time.day)

jobid_handled = []
file_jobhandled = open(filename_jobhandled, "r")
for line in file_jobhandled:
    try:
        jobid_handled.append(str(int(line)))
    except ValueError:
        print("WARNING : the file '" + filename_jobhandled + "' contains a NaN line : '" + line + "'")
file_jobhandled.close()

my_dict = {}
my_dict['OwnerDN'] = [

def __getRPCClient(self):
    if self.__rpcClient:
        return self.__rpcClient
    else:
        return RPCClient(self.__serviceName, **self.__kwargs)

def doCommand(self):
    """
    Return getQuality from DIRAC's accounting ReportsClient

    `args`: a tuple
      - args[0]: string: should be a ValidRes
      - args[1]: string: should be the name of the ValidRes
      - args[2]: optional dateTime object: a "from" date
      - args[3]: optional dateTime object: a "to" date

    :returns: {'Result': None | a float between 0.0 and 100.0}
    """
    super(TransferQuality_Command, self).doCommand()

    if self.RPC is None:
        from DIRAC.Core.DISET.RPCClient import RPCClient
        self.RPC = RPCClient("Accounting/ReportGenerator", timeout=self.timeout)

    if self.client is None:
        from DIRAC.AccountingSystem.Client.ReportsClient import ReportsClient
        self.client = ReportsClient(rpcClient=self.RPC)

    try:
        if self.args[2] is None:
            fromD = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
        else:
            fromD = self.args[2]
    except:
        fromD = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
    try:
        if self.args[3] is None:
            toD = datetime.datetime.utcnow()
        else:
            toD = self.args[3]
    except:
        toD = datetime.datetime.utcnow()

    try:
        pr_quality = self.client.getReport('DataOperation', 'Quality', fromD, toD,
                                           {'OperationType': 'putAndRegister',
                                            'Destination': [self.args[1]]},
                                           'Channel')
        if not pr_quality['OK']:
            raise RSSException(where(self, self.doCommand) + " " + pr_quality['Message'])
    except:
        gLogger.exception("Exception when calling ReportsClient for %s %s" % (self.args[0], self.args[1]))
        return {'Result': 'Unknown'}

    pr_q_d = pr_quality['Value']['data']

    if pr_q_d == {}:
        return {'Result': None}
    else:
        if len(pr_q_d) == 1:
            values = []
            for k in pr_q_d.keys():
                for n in pr_q_d[k].values():
                    values.append(n)
            return {'Result': sum(values) / len(values)}
        else:
            values = []
            for n in pr_q_d['Total'].values():
                values.append(n)
            return {'Result': sum(values) / len(values)}

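# Illustrative only: the same 'Quality' report fetched directly with ReportsClient,
# outside the Command wrapper above. The destination SE name ('CERN-USER') and the
# two-hour window are example values, not taken from the original.
from datetime import datetime, timedelta
from DIRAC.Core.DISET.RPCClient import RPCClient
from DIRAC.AccountingSystem.Client.ReportsClient import ReportsClient

rpc = RPCClient("Accounting/ReportGenerator", timeout=120)
reports = ReportsClient(rpcClient=rpc)
res = reports.getReport('DataOperation', 'Quality',
                        datetime.utcnow() - timedelta(hours=2), datetime.utcnow(),
                        {'OperationType': 'putAndRegister', 'Destination': ['CERN-USER']},
                        'Channel')
if res['OK']:
    # same {'data': {...}} structure averaged by the command above
    print(res['Value']['data'])
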
def export_setPilotBenchmark(self, pilotRef, mark):
    """ Set the pilot agent benchmark
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.setPilotBenchmark(pilotRef, mark)
    return result

def doCommand(self):
    """
    Returns jobs accounting info for sites in the last 24h

    `args`:
      - args[0]: string - should be a ValidRes
      - args[1]: string - should be the name of the ValidRes
      - args[2]: string - should be 'Job' or 'Pilot' or 'DataOperation' or 'WMSHistory' (??) or 'SRM' (??)
      - args[3]: string - should be the plot to generate (e.g. CPUEfficiency)
      - args[4]: dictionary - e.g. {'Format': 'LastHours', 'hours': 24}
      - args[5]: string - should be the grouping
      - args[6]: dictionary - optional conditions
    """
    super(DIRACAccounting_Command, self).doCommand()

    if self.RPC is None:
        from DIRAC.Core.DISET.RPCClient import RPCClient
        self.RPC = RPCClient("Accounting/ReportGenerator", timeout=self.timeout)

    if self.client is None:
        from DIRAC.AccountingSystem.Client.ReportsClient import ReportsClient
        self.client = ReportsClient(rpcClient=self.RPC)

    granularity = self.args[0]
    name = self.args[1]
    accounting = self.args[2]
    plot = self.args[3]
    period = self.args[4]

    if period['Format'] == 'LastHours':
        fromT = datetime.datetime.utcnow() - datetime.timedelta(hours=period['hours'])
        toT = datetime.datetime.utcnow()
    elif period['Format'] == 'Periods':
        # TODO
        pass

    grouping = self.args[5]

    try:
        if self.args[6] is not None:
            conditions = self.args[6]
        else:
            raise Exception
    except:
        conditions = {}

    if accounting == 'Job' or accounting == 'Pilot':
        if granularity == 'Resource':
            conditions['GridCE'] = [name]
        elif granularity == 'Service':
            conditions['Site'] = [name.split('@').pop()]
        elif granularity == 'Site':
            conditions['Site'] = [name]
        else:
            raise InvalidRes(where(self, self.doCommand))
    elif accounting == 'DataOperation':
        conditions['Destination'] = [name]

    try:
        res = self.client.getReport(accounting, plot, fromT, toT, conditions, grouping)
        if res['OK']:
            return {'Result': res['Value']}
        else:
            raise RSSException(where(self, self.doCommand) + ' ' + res['Message'])
    except:
        gLogger.exception("Exception when calling ReportsClient for " + granularity + " " + name)
        return {'Result': 'Unknown'}

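# Illustrative only: an args tuple laid out as documented in the docstring above, for a
# CPUEfficiency plot of Job accounting at an example site, grouped by Site. The site name
# and the empty optional conditions are example values, not taken from the original.
args = ('Site',                                # args[0]: ValidRes granularity
        'LCG.CERN.ch',                         # args[1]: name of the ValidRes
        'Job',                                 # args[2]: accounting type
        'CPUEfficiency',                       # args[3]: plot to generate
        {'Format': 'LastHours', 'hours': 24},  # args[4]: period
        'Site',                                # args[5]: grouping
        None)                                  # args[6]: optional conditions
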
def export_getJobParameter(jobID, parName):
    monitoring = RPCClient('WorkloadManagement/JobMonitoring', timeout=120)
    result = monitoring.getJobParameter(jobID, parName)
    return result

def __call__(self):
    """ request processing """

    self.log.debug("about to execute request")
    gMonitor.addMark("RequestAtt", 1)

    # # setup proxy for request owner
    setupProxy = self.setupProxy()
    if not setupProxy["OK"]:
        self.request.Error = setupProxy["Message"]
        if 'has no proxy registered' in setupProxy["Message"]:
            self.log.error('Error setting proxy. Request set to Failed:', setupProxy["Message"])
            # If user is no longer registered, fail the request
            for operation in self.request:
                for opFile in operation:
                    opFile.Status = 'Failed'
                operation.Status = 'Failed'
        else:
            self.log.error("Error setting proxy", setupProxy["Message"])
        return S_OK(self.request)
    shifter = setupProxy["Value"]["Shifter"]
    proxyFile = setupProxy["Value"]["ProxyFile"]

    error = None

    while self.request.Status == "Waiting":

        # # get waiting operation
        operation = self.request.getWaiting()
        if not operation["OK"]:
            self.log.error("Cannot get waiting operation", operation["Message"])
            return operation
        operation = operation["Value"]
        self.log.info("executing operation", "%s" % operation.Type)

        # # and handler for it
        handler = self.getHandler(operation)
        if not handler["OK"]:
            self.log.error("Unable to process operation",
                           "%s: %s" % (operation.Type, handler["Message"]))
            # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 )
            operation.Error = handler["Message"]
            break

        handler = handler["Value"]
        # # set shifters list in the handler
        handler.shifter = shifter
        # # and execute
        pluginName = self.getPluginName(self.handlersDict.get(operation.Type))
        if self.standalone:
            useServerCertificate = gConfig.useServerCertificate()
        else:
            # Always use server certificates if executed within an agent
            useServerCertificate = True
        try:
            if pluginName:
                gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
            # Always use request owner proxy
            if useServerCertificate:
                gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false')
            exe = handler()
            if useServerCertificate:
                gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')
            if not exe["OK"]:
                self.log.error("unable to process operation",
                               "%s: %s" % (operation.Type, exe["Message"]))
                if pluginName:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                gMonitor.addMark("RequestFail", 1)

                if self.request.JobID:
                    # Check if the job exists
                    monitorServer = RPCClient("WorkloadManagement/JobMonitoring", useCertificates=True)
                    res = monitorServer.getJobPrimarySummary(int(self.request.JobID))
                    if not res["OK"]:
                        self.log.error("RequestTask: Failed to get job status",
                                       "%d" % self.request.JobID)
                    elif not res['Value']:
                        self.log.warn("RequestTask: job does not exist (anymore): failed request",
                                      "JobID: %d" % self.request.JobID)
                        for opFile in operation:
                            opFile.Status = 'Failed'
                        if operation.Status != 'Failed':
                            operation.Status = 'Failed'
                        self.request.Error = 'Job no longer exists'
        except Exception as error:
            self.log.exception("hit by exception:", "%s" % error)
            if pluginName:
                gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            gMonitor.addMark("RequestFail", 1)
            if useServerCertificate:
                gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')
            break

        # # operation status check
        if operation.Status == "Done" and pluginName:
            gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
        elif operation.Status == "Failed" and pluginName:
            gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
        elif operation.Status in ("Waiting", "Scheduled"):
            # # no update for waiting or all files scheduled
            break

        gMonitor.flush()

    if error:
        return S_ERROR(error)

    # # request done?
    if self.request.Status == "Done":
        # # update request to the RequestDB
        self.log.info("Updating request status:", "%s" % self.request.Status)
        update = self.updateRequest()
        if not update["OK"]:
            self.log.error("Cannot update request status", update["Message"])
            return update
        self.log.info("request is done", "%s" % self.request.RequestName)
        gMonitor.addMark("RequestOK", 1)
        # # and there is a job waiting for it? finalize!
        if self.request.JobID:
            attempts = 0
            while True:
                finalizeRequest = self.requestClient.finalizeRequest(
                    self.request.RequestID, self.request.JobID)  # pylint: disable=no-member
                if not finalizeRequest["OK"]:
                    if not attempts:
                        self.log.error("unable to finalize request, will retry",
                                       "ReqName %s:%s" % (self.request.RequestName,
                                                          finalizeRequest["Message"]))
                    self.log.debug("Waiting 10 seconds")
                    attempts += 1
                    if attempts == 10:
                        self.log.error("Giving up finalize request")
                        return S_ERROR('Could not finalize request')
                    time.sleep(10)
                else:
                    self.log.info("request is finalized",
                                  "ReqName %s %s" % (self.request.RequestName,
                                                     (' after %d attempts' % attempts) if attempts else ''))
                    break

    # Request will be updated by the callBack method
    self.log.verbose("RequestTasks exiting", "request %s" % self.request.Status)
    return S_OK(self.request)

def export_commitRegisters(self, entriesList):
    acc = RPCClient('Accounting/DataStore')
    retVal = acc.commitRegisters(entriesList)
    return retVal

def getFileMetadata(self, path):
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, 'getFileMetadata', [path], {})

def __requestJob(self, ceDict):
    """ Request a single job from the matcher service.
    """
    matcher = RPCClient('WorkloadManagement/Matcher', timeout=600)
    return matcher.requestJob(ceDict)

def pinFile(self, path, lifetime=60 * 60 * 24):
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, 'pinFile', [path], {'lifetime': lifetime})

def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary
    """
    queues = self.queueDict.keys()

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
        tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots'
                  % (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())
    # if tagWaitingPilots >= totalWaitingJobs:
    #   self.log.info( 'No more pilots to be submitted in this cycle' )
    #   return S_OK()
    result = self.siteClient.getUsableSites()
    if not result['OK']:
        return result
    siteMaskList = result['Value']

    queues = self.queueDict.keys()
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        queueTags = self.queueDict[queue]['ParametersDict']['Tag']
        siteMask = siteName in siteMaskList
        processorTags = []

        # Check the status of the Site
        result = self.siteClient.getUsableSites(siteName)
        if not result['OK']:
            self.log.error("Can not get the status of site %s: %s" % (siteName, result['Message']))
            continue
        if siteName not in result.get('Value', []):
            self.log.info("site %s is not active" % siteName)
            continue

        if self.rssFlag:
            # Check the status of the ComputingElement
            result = self.rssClient.getElementStatus(ceName, "ComputingElement")
            if not result['OK']:
                self.log.error("Can not get the status of computing element",
                               " %s: %s" % (siteName, result['Message']))
                continue
            if result['Value']:
                # get the value of the status
                result = result['Value'][ceName]['all']

            if result not in ('Active', 'Degraded'):
                self.log.verbose("Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
                continue

        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #   self.log.info( 'Site not in the mask %s' % siteName )
        #   self.log.info( 'Removing "Site" from matching Dict' )
        #   del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = queueTags

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDList = taskQueueDict.keys()
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)

                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue'
                         % (totalTQJobs, len(tqIDList), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors:

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

            processors = 1

            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
                continue

            self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s"
                             % (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in the multiprocessor case.
            #       This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d'
                          % (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)
                httpProxy = self.queueDict[queue]['ParametersDict'].get('HttpProxy', '')

                result = self.getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir,
                                            processors=processors)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB: assign pilots to TaskQueues proportionally
                # to the task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if tqID not in tqDict:
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(pilotList, tqID,
                                                               self.pilotDN, self.pilotGroup,
                                                               self.localhost, ceType, '',
                                                               stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])
                            continue

    self.log.info("%d pilots submitted in total in this cycle, %d matched queues"
                  % (totalSubmittedPilots, matchedQueues))

    return S_OK()

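# Illustrative only: the priority-proportional assignment used above to attach submitted
# pilots to task queues, isolated as a standalone sketch with made-up priorities. The
# helper name (pickTaskQueue) is hypothetical, not part of the original.
import random

def pickTaskQueue(taskQueuePriorities):
    """ taskQueuePriorities: dict {tqID: priority}; returns a tqID drawn with
        probability proportional to its priority (cumulative-sum sampling). """
    cumulative = []
    total = 0.
    for tqID, priority in taskQueuePriorities.items():
        total += priority
        cumulative.append((tqID, total))
    rndm = random.random() * total
    for tqID, bound in cumulative:
        if rndm < bound:
            return tqID
    return cumulative[-1][0]  # guard against floating point edge cases

# e.g. pilots for {1: 1.0, 2: 3.0} land in TQ 2 roughly three times as often as in TQ 1
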
def getDirectorySize(self, path):
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, 'getDirectorySize', [path], {})

def __getRPCClient(self):
    if self.getRPCClient:
        return self.getRPCClient("Framework/SiteMap")
    return RPCClient("Framework/SiteMap")

def removeDirectory(self, path, recursive=False):
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, 'removeDirectory', [path], {'recursive': recursive})

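# Illustrative only: the storage wrappers above all follow the same shape, forwarding a
# method name plus positional and keyword arguments to callProxyMethod on the proxy
# service. A hypothetical shared helper (not in the original) could capture that pattern:
def _callProxy(self, method, args, kwargs=None):
    """ Forward a storage-element method call through the proxy service. """
    client = RPCClient(self.url)
    return client.callProxyMethod(self.name, method, args, kwargs or {})

# e.g. exists() would then reduce to: return self._callProxy('exists', [path])
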
            jobID = int(jobID)
        except Exception as x:
            return self._errorReport(str(x),
                                     'Expected integer or convertible integer for existing jobID')
    elif type(jobID) == type([]):
        try:
            jobID = [int(job) for job in jobID]
        except Exception as x:
            return self._errorReport(str(x),
                                     'Expected integer or convertible integer for existing jobIDs')

    jobManager = RPCClient('WorkloadManagement/JobManager', useCertificates=False)
    result = jobManager.resetJob(jobID)
    return result

#############################################################################
def getJobPilotOutput(self, jobID, directory=''):
    """Retrieve the pilot output for an existing job in the WMS.
       The output will be retrieved in a local directory unless otherwise specified.

       >>> print dirac.getJobPilotOutput(12345)
       {'OK': True, StdOut:'',StdError:''}

       @param job: JobID
       @type job: integer or string
       @return: S_OK,S_ERROR

def __getRPCClient(self):
    """ Get an RPC client for SB service """
    if self.__rpcClient:
        return self.__rpcClient
    else:
        return RPCClient(self.__serviceName, **self.__kwargs)

def requestManager(cls):
    """ get request manager """
    if not cls.__requestManager:
        cls.__requestManager = RPCClient("RequestManagement/ReqManager")
    return cls.__requestManager
