コード例 #1
0
 def registerToServer(self):
   """
   Register this type with the "Accounting/DataStore" service.

   The value returned by ``self.getDefinition()`` is unpacked into the
   positional arguments of the remote ``registerType`` call.

   :return: whatever the remote ``registerType`` call returns
   """
   return RPCClient("Accounting/DataStore").registerType(
       *self.getDefinition())
コード例 #2
0
ファイル: ProxyStorage.py プロジェクト: rob-c/DIRAC
 def createDirectory(self, path):
     """Relay a ``createDirectory`` request for ``path`` through the proxy service.

     An RPC client is built from ``self.url`` and its ``callProxyMethod`` is
     invoked with ``self.name``, the method name, ``[path]`` as positional
     arguments and an empty keyword dictionary.

     :param path: forwarded as the only positional argument
     :return: the result of ``callProxyMethod``
     """
     proxy = RPCClient(self.url)
     positional = [path]
     keywords = {}
     return proxy.callProxyMethod(self.name, 'createDirectory', positional,
                                  keywords)
コード例 #3
0
ファイル: ReqClient.py プロジェクト: ptakha/DIRAC-1
    def finalizeRequest(self, requestName, jobID):
        """ Finalize a request that has reached the 'Done' state.

        The request status is checked first; then the job's "PendingRequest"
        parameter is refreshed with the request digest and the job status
        (or only its minor status) is moved to "Requests done".

        :param self: self reference
        :param str requestName: request name
        :param int jobID: job id
        :return: S_OK() when the job was updated or no longer exists,
                 otherwise the failing service result or an S_ERROR
        """
        jobStateClient = RPCClient("WorkloadManagement/JobStateUpdate",
                                   useCertificates=True)

        # Any failure below is returned to the caller, which is expected to
        # re-try the finalization later.
        requestStatus = self.getRequestStatus(requestName)
        if not requestStatus['OK']:
            self.log.error(
                "finalizeRequest: failed to get request",
                "request: %s status: %s" % (requestName,
                                            requestStatus["Message"]))
            return requestStatus
        if requestStatus["Value"] != "Done":
            return S_ERROR(
                "The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
                % (requestName, requestStatus['Value']))

        # The request is 'Done': look up the job before touching its status.
        jobMonitor = RPCClient("WorkloadManagement/JobMonitoring",
                               useCertificates=True)
        summary = jobMonitor.getJobPrimarySummary(int(jobID))
        if not summary["OK"]:
            self.log.error("finalizeRequest: Failed to get job status",
                           "JobID: %d" % jobID)
            return S_ERROR("finalizeRequest: Failed to get job %d status" %
                           jobID)
        if not summary['Value']:
            # Nothing to update for a job that is gone.
            self.log.info(
                "finalizeRequest: job %d does not exist (anymore): finalizing"
                % jobID)
            return S_OK()

        jobStatus = summary["Value"]["Status"]
        jobMinorStatus = summary["Value"]["MinorStatus"]

        # The pending request digest is refreshed in every case.
        self.log.info(
            "finalizeRequest: Updating request digest for job %d" % jobID)

        digestResult = self.getDigest(requestName)
        if not digestResult["OK"]:
            # A missing digest is only logged; the status update still runs.
            self.log.error(
                "finalizeRequest: Failed to get request digest for %s: %s"
                % (requestName, digestResult["Message"]))
        else:
            digest = digestResult["Value"]
            self.log.verbose(digest)
            parameterUpdate = jobStateClient.setJobParameter(
                jobID, "PendingRequest", digest)
            if not parameterUpdate["OK"]:
                self.log.info(
                    "finalizeRequest: Failed to set job %d parameter: %s" %
                    (jobID, parameterUpdate["Message"]))
                return parameterUpdate

        stateUpdate = None
        if jobStatus == 'Completed':
            # The new major status depends on the current minor status.
            if jobMinorStatus == "Pending Requests":
                self.log.info(
                    "finalizeRequest: Updating job status for %d to Done/Requests done"
                    % jobID)
                stateUpdate = jobStateClient.setJobStatus(
                    jobID, "Done", "Requests done", "")
            elif jobMinorStatus == "Application Finished With Errors":
                self.log.info(
                    "finalizeRequest: Updating job status for %d to Failed/Requests done"
                    % jobID)
                stateUpdate = jobStateClient.setJobStatus(
                    jobID, "Failed", "Requests done", "")

        if not stateUpdate:
            # No major status change was made: only the minor status moves.
            self.log.info(
                "finalizeRequest: Updating job minor status for %d to Requests done (status is %s)"
                % (jobID, jobStatus))
            stateUpdate = jobStateClient.setJobStatus(
                jobID, jobStatus, "Requests done", "")

        if not stateUpdate["OK"]:
            self.log.error(
                "finalizeRequest: Failed to set job status",
                "JobID: %d status: %s" % (jobID, stateUpdate['Message']))
            return stateUpdate

        return S_OK()
コード例 #4
0
ファイル: ProxyStorage.py プロジェクト: rob-c/DIRAC
 def prestageFileStatus(self, path):
     """Relay a ``prestageFileStatus`` request for ``path`` through the proxy service.

     :param path: forwarded as the only positional argument
     :return: the result of ``callProxyMethod`` on a client built from ``self.url``
     """
     return RPCClient(self.url).callProxyMethod(
         self.name, 'prestageFileStatus', [path], {})
コード例 #5
0
ファイル: ProxyStorage.py プロジェクト: rob-c/DIRAC
 def releaseFile(self, path):
     """Relay a ``releaseFile`` request for ``path`` through the proxy service.

     :param path: forwarded as the only positional argument
     :return: the result of ``callProxyMethod`` on a client built from ``self.url``
     """
     proxyClient = RPCClient(self.url)
     callArgs = (self.name, 'releaseFile', [path], {})
     return proxyClient.callProxyMethod(*callArgs)
コード例 #6
0
    def execute(self):
        """Main Agent code.

        1.- Query TaskQueueDB for existing TQs
        2.- Add their Priorities
        3.- Submit pilots

        :return: S_OK (optionally carrying a message) when the cycle ends
                 normally, the failing Matcher result, or S_ERROR when the
                 summed priorities are not positive
        """

        self.__checkSubmitPools()

        self.directorDict = getResourceDict()
        # Add all submit pools
        self.directorDict['SubmitPool'] = self.am_getOption("SubmitPools")
        # Add all DIRAC platforms if not specified otherwise
        if 'Platform' not in self.directorDict:
            platforms = getDIRACPlatforms()
            if platforms['OK']:
                self.directorDict['Platform'] = platforms['Value']

        matcher = RPCClient("WorkloadManagement/Matcher")
        matching = matcher.getMatchingTaskQueues(self.directorDict)
        if not matching['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                           matching['Message'])
            return matching
        taskQueueDict = matching['Value']

        self.log.info('Found %s TaskQueues' % len(taskQueueDict))

        if not taskQueueDict:
            self.log.info('No TaskQueue to Process')
            return S_OK()

        # Tag every task queue with its own ID and total up priorities/jobs.
        prioritySum = 0
        waitingJobs = 0
        for taskQueueID in taskQueueDict:
            queueInfo = taskQueueDict[taskQueueID]
            queueInfo['TaskQueueID'] = taskQueueID
            prioritySum += queueInfo['Priority']
            waitingJobs += queueInfo['Jobs']

        self.log.info('Sum of Priorities %s' % prioritySum)

        if waitingJobs == 0:
            self.log.info('No waiting Jobs')
            return S_OK('No waiting Jobs')
        if prioritySum <= 0:
            return S_ERROR('Wrong TaskQueue Priorities')

        self.pilotsPerPriority = (
            self.am_getOption('pilotsPerIteration') / prioritySum)
        self.pilotsPerJob = (
            self.am_getOption('pilotsPerIteration') / waitingJobs)

        # The submitted-pilots counter is reset while holding callBackLock.
        self.callBackLock.acquire()
        self.submittedPilots = 0
        self.callBackLock.release()
        self.toSubmitPilots = 0
        waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
        maxWaiting = Time.hour * self.am_getOption("maxPilotWaitingHours")
        timeLimitToConsider = Time.toString(Time.dateTime() - maxWaiting)

        for taskQueueID in taskQueueDict:
            self.log.verbose('Processing TaskQueue', taskQueueID)

            pilotSelection = {
                'TaskQueueID': taskQueueID,
                'Status': waitingStatusList
            }
            counted = pilotAgentsDB.countPilots(pilotSelection, None,
                                                timeLimitToConsider)
            if counted['OK']:
                waitingPilots = counted['Value']
                self.log.verbose(
                    'Waiting Pilots for TaskQueue %s:' % taskQueueID,
                    waitingPilots)
            else:
                # A failed count is treated as "no pilot waiting".
                self.log.error('Fail to get Number of Waiting pilots',
                               counted['Message'])
                waitingPilots = 0

            submission = self.submitPilotsForTaskQueue(
                taskQueueDict[taskQueueID], waitingPilots)

            if submission['OK']:
                self.toSubmitPilots += submission['Value']

        self.log.info('Number of pilots to be Submitted %s' %
                      self.toSubmitPilots)

        # Now wait until all Jobs in the Default ThreadPool are processed
        # (only the 'Default' pool is drained, not every pool in self.pools)
        if 'Default' in self.pools:
            self.pools['Default'].processAllResults()

        self.log.info('Number of pilots Submitted %s' % self.submittedPilots)

        return S_OK()
コード例 #7
0
ファイル: ProxyStorage.py プロジェクト: rob-c/DIRAC
 def exists(self, path):
     """Relay an ``exists`` request for ``path`` through the proxy service.

     :param path: forwarded as the only positional argument
     :return: the result of ``callProxyMethod`` on a client built from ``self.url``
     """
     proxyService = RPCClient(self.url)
     return proxyService.callProxyMethod(self.name, 'exists', [path], {})
コード例 #8
0
 def export_sendHeartBeat(self, jobID, dynamicData, staticData):
     """ Forward a heart beat (sign of life) for the job ``jobID``.

     The call is relayed to ``sendHeartBeat`` of the
     'WorkloadManagement/JobStateUpdate' service through an RPC client
     created with ``timeout=120``; the service reply is returned as is.
     """
     stateUpdate = RPCClient('WorkloadManagement/JobStateUpdate', timeout=120)
     return stateUpdate.sendHeartBeat(jobID, dynamicData, staticData)
コード例 #9
0
 def export_setJobForPilot(self, jobID, pilotRef, destination=None):
     """ Report the DIRAC job ID which is executed by the given pilot job.

     The three arguments are forwarded unchanged to ``setJobForPilot`` of the
     'WorkloadManagement/WMSAdministrator' service and its reply is returned.
     """
     administrator = RPCClient('WorkloadManagement/WMSAdministrator')
     return administrator.setJobForPilot(jobID, pilotRef, destination)
コード例 #10
0
ファイル: MonitoringClient.py プロジェクト: acasajus/DIRAC
 def getComponentsStatus(self, condDict):
     """Query the "Framework/Monitoring" service for components status.

     :param condDict: forwarded unchanged to the remote ``getComponentsStatus``
     :return: the reply of the remote call (client created with ``timeout=100``)
     """
     return RPCClient("Framework/Monitoring",
                      timeout=100).getComponentsStatus(condDict)
コード例 #11
0
 def export_setJobSite(self, jobID, site):
     """Allows the site attribute to be set for a job specified by its jobID.

     The request is forwarded to ``setJobSite`` of the
     'WorkloadManagement/JobStateUpdate' service and its reply is returned.
     """
     stateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
     return stateUpdate.setJobSite(jobID, site)
コード例 #12
0
 def __getRPCClient(self):
     """Return ``self.rpcClient`` when it is truthy, else a new client.

     The fallback client is built from ``self.serviceName`` and is not stored
     on the instance.
     """
     return self.rpcClient or RPCClient(self.serviceName)
コード例 #13
0
    os.environ['X509_USER_PROXY'] = proxy
    print("*INFO* using proxy %s" % proxy)

    print "*********************************************************************************************************"
    print "Execution at : '" + time.strftime('%d/%m/%y %H:%M',
                                             time.localtime()) + "'"

    try:
        d = Dirac()
    except AttributeError:
        sys.stderr.write(
            time.strftime('%d/%m/%y %H:%M', time.localtime()) +
            " => Error loading Dirac monitor\n")
        raise Exception("Error loading Dirac monitor")

    w = RPCClient("WorkloadManagement/JobMonitoring")

    delTime = str(Time.dateTime() - delay_job_handled * Time.day)

    jobid_handled = []

    file_jobhandled = open(filename_jobhandled, "r")
    for line in file_jobhandled:
        try:
            jobid_handled.append(str(int(line)))
        except ValueError:
            print "WARNING : the file '" + filename_jobhandled + "' contains a NaN line : '" + line + "'"
    file_jobhandled.close()

    my_dict = {}
    my_dict['OwnerDN'] = [
コード例 #14
0
ファイル: SandboxStoreClient.py プロジェクト: acasajus/DIRAC
 def __getRPCClient(self):
     """Return the stored RPC client when it is truthy, else a new client.

     The fallback client is built from ``self.__serviceName`` and the keyword
     arguments kept in ``self.__kwargs``; it is not stored on the instance.
     """
     if not self.__rpcClient:
         return RPCClient(self.__serviceName, **self.__kwargs)
     return self.__rpcClient
コード例 #15
0
    def doCommand(self):
        """
        Return the transfer quality from DIRAC's accounting ReportsClient.

        `args`: a tuple
          - args[0]: string: should be a ValidRes

          - args[1]: string should be the name of the ValidRes

          - args[2]: optional dateTime object: a "from" date
            (defaults to two hours before the current UTC time)

          - args[3]: optional dateTime object: a "to" date
            (defaults to the current UTC time)

        :returns:
          {'Result': None} when the report holds no data,
          {'Result': 'Unknown'} when the report could not be obtained,
          otherwise {'Result': mean of the reported values}
        """
        super(TransferQuality_Command, self).doCommand()

        if self.RPC is None:
            from DIRAC.Core.DISET.RPCClient import RPCClient
            self.RPC = RPCClient("Accounting/ReportGenerator",
                                 timeout=self.timeout)

        if self.client is None:
            from DIRAC.AccountingSystem.Client.ReportsClient import ReportsClient
            self.client = ReportsClient(rpcClient=self.RPC)

        # The optional dates may be absent from args altogether or given as
        # None; both cases fall back to the default time window.
        try:
            fromD = self.args[2]
        except Exception:
            fromD = None
        if fromD is None:
            fromD = datetime.datetime.utcnow() - datetime.timedelta(hours=2)

        try:
            toD = self.args[3]
        except Exception:
            toD = None
        if toD is None:
            toD = datetime.datetime.utcnow()

        try:
            pr_quality = self.client.getReport(
                'DataOperation', 'Quality', fromD, toD, {
                    'OperationType': 'putAndRegister',
                    'Destination': [self.args[1]]
                }, 'Channel')

            if not pr_quality['OK']:
                # Raised only so that the handler below logs it with a
                # traceback; call syntax is valid on Python 2 and Python 3.
                raise RSSException(
                    where(self, self.doCommand) + " " + pr_quality['Message'])

        except Exception:
            gLogger.exception(
                "Exception when calling ReportsClient for %s %s" %
                (self.args[0], self.args[1]))
            return {'Result': 'Unknown'}

        pr_q_d = pr_quality['Value']['data']

        if pr_q_d == {}:
            return {'Result': None}

        if len(pr_q_d) == 1:
            # A single series: average its own values.
            values = [n for series in pr_q_d.values()
                      for n in series.values()]
        else:
            # Several series: average the 'Total' one.
            values = list(pr_q_d['Total'].values())

        if not values:
            # No data points: report "no data" instead of dividing by zero.
            return {'Result': None}
        return {'Result': sum(values) / len(values)}
コード例 #16
0
 def export_setPilotBenchmark(self, pilotRef, mark):
     """ Set the pilot agent benchmark.

     Both arguments are forwarded unchanged to ``setPilotBenchmark`` of the
     'WorkloadManagement/WMSAdministrator' service and its reply is returned.
     """
     administrator = RPCClient('WorkloadManagement/WMSAdministrator')
     return administrator.setPilotBenchmark(pilotRef, mark)
コード例 #17
0
    def doCommand(self):
        """
        Return an accounting report for the given element.

        `args`:
           - args[0]: string - should be a ValidRes

           - args[1]: string - should be the name of the ValidRes

           - args[2]: string - should be 'Job' or 'Pilot' or 'DataOperation'
             or 'WMSHistory' (??) or 'SRM' (??)

           - args[3]: string - should be the plot to generate (e.g. CPUEfficiency)

           - args[4]: dictionary - e.g. {'Format': 'LastHours', 'hours': 24}
             (only the 'LastHours' format is implemented)

           - args[5]: string - should be the grouping

           - args[6]: dictionary - optional conditions; when missing or None
             they are derived from args[0], args[1] and args[2]

        :returns: {'Result': <report value>} on success,
                  {'Result': 'Unknown'} when the report could not be obtained
        :raises InvalidRes: when default conditions are needed for a 'Job' or
                            'Pilot' report and args[0] is not 'Resource',
                            'Service' or 'Site'
        """
        super(DIRACAccounting_Command, self).doCommand()

        if self.RPC is None:
            from DIRAC.Core.DISET.RPCClient import RPCClient
            self.RPC = RPCClient("Accounting/ReportGenerator",
                                 timeout=self.timeout)

        if self.client is None:
            from DIRAC.AccountingSystem.Client.ReportsClient import ReportsClient
            self.client = ReportsClient(rpcClient=self.RPC)

        granularity = self.args[0]
        name = self.args[1]
        accounting = self.args[2]
        plot = self.args[3]
        period = self.args[4]

        # Stay None for any format that is not implemented (e.g. 'Periods').
        fromT = None
        toT = None
        if period['Format'] == 'LastHours':
            fromT = datetime.datetime.utcnow() - datetime.timedelta(
                hours=period['hours'])
            toT = datetime.datetime.utcnow()
        # TODO: implement the 'Periods' format

        grouping = self.args[5]

        # Conditions are optional: they may be absent from args or be None.
        try:
            conditions = self.args[6]
        except Exception:
            conditions = None
        if conditions is None:
            conditions = {}
            if accounting in ('Job', 'Pilot'):
                if granularity == 'Resource':
                    conditions['GridCE'] = [name]
                elif granularity == 'Service':
                    conditions['Site'] = [name.split('@').pop()]
                elif granularity == 'Site':
                    conditions['Site'] = [name]
                else:
                    raise InvalidRes(where(self, self.doCommand))
            elif accounting == 'DataOperation':
                conditions['Destination'] = [name]

        if fromT is None or toT is None:
            # Without a time window no report can be requested; say so
            # explicitly instead of failing later on an unbound name.
            gLogger.error("Unsupported period format for %s %s: %s" %
                          (granularity, name, period['Format']))
            return {'Result': 'Unknown'}

        try:

            res = self.client.getReport(accounting, plot, fromT, toT,
                                        conditions, grouping)

            if res['OK']:
                return {'Result': res['Value']}
            # Raised only so that the handler below logs it with a traceback;
            # call syntax is valid on Python 2 and Python 3.
            raise RSSException(
                where(self, self.doCommand) + ' ' + res['Message'])

        except Exception:
            gLogger.exception("Exception when calling ReportsClient for " +
                              granularity + " " + name)
            return {'Result': 'Unknown'}
コード例 #18
0
 def export_getJobParameter(jobID, parName):
     """Fetch parameter ``parName`` of job ``jobID`` from the JobMonitoring service.

     The call is relayed to ``getJobParameter`` of the
     'WorkloadManagement/JobMonitoring' service (client created with
     ``timeout=120``) and the reply is returned unchanged.

     NOTE(review): declared without ``self``; presumably wrapped as a static
     method outside this excerpt - confirm.
     """
     jobMonitoring = RPCClient('WorkloadManagement/JobMonitoring', timeout=120)
     return jobMonitoring.getJobParameter(jobID, parName)
コード例 #19
0
ファイル: RequestTask.py プロジェクト: sanguillon/DIRAC
    def __call__(self):
        """ request processing

        Sets up the request owner's proxy, then executes the waiting
        operations of ``self.request`` one by one with their handlers. When
        the request ends up 'Done' it is updated and, if it is attached to a
        job, finalized (up to 10 attempts, 10 seconds apart).

        :return: S_OK( request ) in the normal case (including a failed proxy
                 setup), the failing result when the waiting operation or the
                 request update cannot be obtained, or S_ERROR when an
                 operation handler raised or the finalization gave up
        """

        self.log.debug("about to execute request")
        gMonitor.addMark("RequestAtt", 1)

        # # setup proxy for request owner
        setupProxy = self.setupProxy()
        if not setupProxy["OK"]:
            self.request.Error = setupProxy["Message"]
            if 'has no proxy registered' in setupProxy["Message"]:
                self.log.error('Error setting proxy. Request set to Failed:',
                               setupProxy["Message"])
                # If user is no longer registered, fail the request
                for operation in self.request:
                    for opFile in operation:
                        opFile.Status = 'Failed'
                    operation.Status = 'Failed'
            else:
                self.log.error("Error setting proxy", setupProxy["Message"])
            return S_OK(self.request)
        shifter = setupProxy["Value"]["Shifter"]
        # NOTE(review): proxyFile is not used further in this method
        proxyFile = setupProxy["Value"]["ProxyFile"]

        error = None
        while self.request.Status == "Waiting":

            # # get waiting operation
            operation = self.request.getWaiting()
            if not operation["OK"]:
                self.log.error("Cannot get waiting operation",
                               operation["Message"])
                return operation
            operation = operation["Value"]
            self.log.info("executing operation", "%s" % operation.Type)

            # # and handler for it
            handler = self.getHandler(operation)
            if not handler["OK"]:
                self.log.error("Unable to process operation",
                               "%s: %s" % (operation.Type, handler["Message"]))
                # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 )
                operation.Error = handler["Message"]
                break

            handler = handler["Value"]
            # # set shifters list in the handler
            handler.shifter = shifter
            # # and execute
            pluginName = self.getPluginName(
                self.handlersDict.get(operation.Type))
            if self.standalone:
                useServerCertificate = gConfig.useServerCertificate()
            else:
                # Always use server certificates if executed within an agent
                useServerCertificate = True
            try:
                if pluginName:
                    gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
                # Always use request owner proxy: server certificates are
                # switched off only for the duration of the handler call
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        '/DIRAC/Security/UseServerCertificate', 'false')
                exe = handler()
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        '/DIRAC/Security/UseServerCertificate', 'true')
                if not exe["OK"]:
                    self.log.error("unable to process operation",
                                   "%s: %s" % (operation.Type, exe["Message"]))
                    if pluginName:
                        gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                    gMonitor.addMark("RequestFail", 1)
                    if self.request.JobID:
                        # Check if the job exists
                        monitorServer = RPCClient(
                            "WorkloadManagement/JobMonitoring",
                            useCertificates=True)
                        res = monitorServer.getJobPrimarySummary(
                            int(self.request.JobID))
                        if not res["OK"]:
                            self.log.error(
                                "RequestTask: Failed to get job status",
                                "%d" % self.request.JobID)
                        elif not res['Value']:
                            self.log.warn(
                                "RequestTask: job does not exist (anymore): failed request",
                                "JobID: %d" % self.request.JobID)
                            for opFile in operation:
                                opFile.Status = 'Failed'
                            if operation.Status != 'Failed':
                                operation.Status = 'Failed'
                            self.request.Error = 'Job no longer exists'
            except Exception as exc:
                # Python 3 unbinds the "as" target when the handler exits, so
                # the exception is copied into the outer variable that is
                # checked after the loop.
                error = exc
                self.log.exception("hit by exception:", "%s" % error)
                if pluginName:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                gMonitor.addMark("RequestFail", 1)
                # Restore server certificates in case the handler raised
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        '/DIRAC/Security/UseServerCertificate', 'true')
                break

            # # operation status check
            if operation.Status == "Done" and pluginName:
                gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
            elif operation.Status == "Failed" and pluginName:
                gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            elif operation.Status in ("Waiting", "Scheduled"):
                # # no update for waiting or all files scheduled
                break

        gMonitor.flush()

        if error:
            return S_ERROR(error)

        # # request done?
        if self.request.Status == "Done":
            # # update request to the RequestDB
            self.log.info("Updating request status:",
                          "%s" % self.request.Status)
            update = self.updateRequest()
            if not update["OK"]:
                self.log.error("Cannot update request status",
                               update["Message"])
                return update
            self.log.info("request is done", "%s" % self.request.RequestName)
            gMonitor.addMark("RequestOK", 1)
            # # and there is a job waiting for it? finalize!
            if self.request.JobID:
                attempts = 0
                while True:
                    finalizeRequest = self.requestClient.finalizeRequest(
                        self.request.RequestID, self.request.JobID)  # pylint: disable=no-member
                    if not finalizeRequest["OK"]:
                        # Only the first failure is reported as an error
                        if not attempts:
                            self.log.error(
                                "unable to finalize request, will retry",
                                "ReqName %s:%s" % (self.request.RequestName,
                                                   finalizeRequest["Message"]))
                        self.log.debug("Waiting 10 seconds")
                        attempts += 1
                        if attempts == 10:
                            self.log.error("Giving up finalize request")
                            return S_ERROR('Could not finalize request')

                        time.sleep(10)

                    else:
                        self.log.info(
                            "request is finalized",
                            "ReqName %s %s" % (self.request.RequestName,
                                               (' after %d attempts' %
                                                attempts) if attempts else ''))
                        break

        # Request will be updated by the callBack method
        self.log.verbose("RequestTasks exiting",
                         "request %s" % self.request.Status)
        return S_OK(self.request)
コード例 #20
0
 def export_commitRegisters(self, entriesList):
     """Forward ``entriesList`` to ``commitRegisters`` of the 'Accounting/DataStore' service.

     :param entriesList: passed unchanged to the remote call
     :return: the reply of the remote ``commitRegisters`` call
     """
     return RPCClient('Accounting/DataStore').commitRegisters(entriesList)
コード例 #21
0
ファイル: ProxyStorage.py プロジェクト: rob-c/DIRAC
 def getFileMetadata(self, path):
     """Relay a ``getFileMetadata`` request for ``path`` through the proxy service.

     :param path: forwarded as the only positional argument
     :return: the result of ``callProxyMethod`` on a client built from ``self.url``
     """
     return RPCClient(self.url).callProxyMethod(self.name, 'getFileMetadata',
                                                [path], {})
コード例 #22
0
 def __requestJob(self, ceDict):
     """Request a single job from the matcher service.

     :param ceDict: forwarded unchanged to the remote ``requestJob``
     :return: the reply of 'WorkloadManagement/Matcher' (client created with
              ``timeout=600``)
     """
     return RPCClient('WorkloadManagement/Matcher',
                      timeout=600).requestJob(ceDict)
コード例 #23
0
ファイル: ProxyStorage.py プロジェクト: rob-c/DIRAC
 def pinFile(self, path, lifetime=60 * 60 * 24):
     """Relay a ``pinFile`` request for ``path`` through the proxy service.

     :param path: forwarded as the only positional argument
     :param lifetime: forwarded as the ``lifetime`` keyword argument; the
                      default is 60 * 60 * 24 (presumably seconds, i.e. one
                      day - confirm)
     :return: the result of ``callProxyMethod`` on a client built from ``self.url``
     """
     proxyClient = RPCClient(self.url)
     keywordArgs = {'lifetime': lifetime}
     return proxyClient.callProxyMethod(self.name, 'pinFile', [path],
                                        keywordArgs)
コード例 #24
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        queues = self.queueDict.keys()

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for queue in queues:
            tags += self.queueDict[queue]['ParametersDict']['Tag']
        tqDict['Tag'] = list(set(tags))

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()
        self.log.info(tqIDList)
        result = pilotAgentsDB.countPilots(
            {
                'TaskQueueID': tqIDList,
                'Status': WAITING_PILOT_STATUS
            }, None)
        tagWaitingPilots = 0
        if result['OK']:
            tagWaitingPilots = result['Value']
        self.log.info(
            'Total %d jobs in %d task queues with %d waiting pilots' %
            (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
        self.log.info('Queues: ', self.queueDict.keys())
        # if tagWaitingPilots >= totalWaitingJobs:
        #  self.log.info( 'No more pilots to be submitted in this cycle' )
        #  return S_OK()

        result = self.siteClient.getUsableSites()
        if not result['OK']:
            return result
        siteMaskList = result['Value']

        queues = self.queueDict.keys()
        random.shuffle(queues)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for queue in queues:

            # Check if the queue failed previously
            failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
            if failedCount != 0:
                self.log.warn("%s queue failed recently, skipping %d cycles" %
                              (queue, 10 - failedCount))
                self.failedQueues[queue] += 1
                continue

            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            platform = self.queueDict[queue]['Platform']
            queueTags = self.queueDict[queue]['ParametersDict']['Tag']
            siteMask = siteName in siteMaskList
            processorTags = []

            # Check the status of the Site
            result = self.siteClient.getUsableSites(siteName)
            if not result['OK']:
                self.log.error("Can not get the status of site %s: %s" %
                               (siteName, result['Message']))
                continue
            if siteName not in result.get('Value', []):
                self.log.info("site %s is not active" % siteName)
                continue

            if self.rssFlag:
                # Check the status of the ComputingElement
                result = self.rssClient.getElementStatus(
                    ceName, "ComputingElement")
                if not result['OK']:
                    self.log.error(
                        "Can not get the status of computing element",
                        " %s: %s" % (siteName, result['Message']))
                    continue
                if result['Value']:
                    # get the value of the status
                    result = result['Value'][ceName]['all']

                if result not in ('Active', 'Degraded'):
                    self.log.verbose(
                        "Skipping computing element %s at %s: resource not usable"
                        % (ceName, siteName))
                    continue

            for tag in queueTags:
                if re.match(r'^[0-9]+Processors$', tag):
                    processorTags.append(tag)
            if 'WholeNode' in queueTags:
                processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (queueName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose(
                    "Skipping queue %s at site %s not in the mask" %
                    (queueName, siteName))
                continue

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            # if not siteMask and 'Site' in ceDict:
            #  self.log.info( 'Site not in the mask %s' % siteName )
            #  self.log.info( 'Removing "Site" from matching Dict' )
            #  del ceDict[ 'Site' ]
            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = queueTags
            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % queue)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            totalTQJobsByProcessors = {}
            tqIDList = taskQueueDict.keys()
            tqIDListByProcessors = {}
            for tq in taskQueueDict:
                if 'Tags' not in taskQueueDict[tq]:
                    # skip non multiprocessor tqs
                    continue
                for tag in taskQueueDict[tq]['Tags']:
                    if tag in processorTags:
                        tqIDListByProcessors.setdefault(tag, [])
                        tqIDListByProcessors[tag].append(tq)

                        totalTQJobsByProcessors.setdefault(tag, 0)
                        totalTQJobsByProcessors[tag] += taskQueueDict[tq][
                            'Jobs']

                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), queue))

            queueSubmittedPilots = 0
            for tag in tqIDListByProcessors:

                self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" %
                                 (tag, tqIDListByProcessors[tag]))

                processors = 1

                m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
                if m:
                    processors = int(m.group('processors'))
                if tag == 'WholeNode':
                    processors = -1

                tagTQJobs = totalTQJobsByProcessors[tag]
                tagTqIDList = tqIDListByProcessors[tag]

                # Get the number of already waiting pilots for these task queues
                tagWaitingPilots = 0
                if self.pilotWaitingFlag:
                    lastUpdateTime = dateTime(
                    ) - self.pilotWaitingTime * second
                    result = pilotAgentsDB.countPilots(
                        {
                            'TaskQueueID': tagTqIDList,
                            'Status': WAITING_PILOT_STATUS
                        }, None, lastUpdateTime)
                    if not result['OK']:
                        self.log.error(
                            'Failed to get Number of Waiting pilots',
                            result['Message'])
                        tagWaitingPilots = 0
                    else:
                        tagWaitingPilots = result['Value']
                        self.log.verbose(
                            'Waiting Pilots for TaskQueue %s:' % tagTqIDList,
                            tagWaitingPilots)
                if tagWaitingPilots >= tagTQJobs:
                    self.log.verbose(
                        "%d waiting pilots already for all the available jobs"
                        % tagWaitingPilots)
                    continue

                self.log.verbose(
                    "%d waiting pilots for the total of %d eligible jobs for %s"
                    % (tagWaitingPilots, tagTQJobs, queue))

                # Get the working proxy
                cpuTime = queueCPUTime + 86400
                self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                                 (self.pilotDN, self.pilotGroup, cpuTime))
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, cpuTime)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, cpuTime - 60)

                # Get the number of available slots on the target site/queue
                totalSlots = self.getQueueSlots(queue, False)
                if totalSlots == 0:
                    self.log.debug('%s: No slots available' % queue)
                    continue

                # Note: comparing slots to job numbers is not accurate in multiprocessor case.
                #       This could lead to over submission.
                pilotsToSubmit = max(
                    0, min(totalSlots, tagTQJobs - tagWaitingPilots))
                self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                               ( queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit ) )

                # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
                pilotsToSubmit = min(
                    self.maxPilotsToSubmit - queueSubmittedPilots,
                    pilotsToSubmit)

                while pilotsToSubmit > 0:
                    self.log.info('Going to submit %d pilots to %s queue' %
                                  (pilotsToSubmit, queue))

                    bundleProxy = self.queueDict[queue].get(
                        'BundleProxy', False)
                    jobExecDir = ''
                    jobExecDir = self.queueDict[queue]['ParametersDict'].get(
                        'JobExecDir', jobExecDir)
                    httpProxy = self.queueDict[queue]['ParametersDict'].get(
                        'HttpProxy', '')

                    result = self.getExecutable(queue,
                                                pilotsToSubmit,
                                                bundleProxy,
                                                httpProxy,
                                                jobExecDir,
                                                processors=processors)
                    if not result['OK']:
                        return result

                    executable, pilotSubmissionChunk = result['Value']
                    result = ce.submitJob(executable,
                                          '',
                                          pilotSubmissionChunk,
                                          processors=processors)
                    # ## FIXME: The condor thing only transfers the file with some
                    # ## delay, so when we unlink here the script is gone
                    # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                    if ceType != 'HTCondorCE':
                        os.unlink(executable)
                    if not result['OK']:
                        self.log.error(
                            'Failed submission to queue %s:\n' % queue,
                            result['Message'])
                        pilotsToSubmit = 0
                        self.failedQueues[queue] += 1
                        continue

                    pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                    queueSubmittedPilots += pilotSubmissionChunk
                    # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                    # task queue priorities
                    pilotList = result['Value']
                    self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                    totalSubmittedPilots += len(pilotList)
                    self.log.info('Submitted %d pilots to %s@%s' %
                                  (len(pilotList), queueName, ceName))
                    stampDict = {}
                    if result.has_key('PilotStampDict'):
                        stampDict = result['PilotStampDict']
                    tqPriorityList = []
                    sumPriority = 0.
                    for tq in tagTqIDList:
                        sumPriority += taskQueueDict[tq]['Priority']
                        tqPriorityList.append((tq, sumPriority))
                    rndm = random.random() * sumPriority
                    tqDict = {}
                    for pilotID in pilotList:
                        rndm = random.random() * sumPriority
                        for tq, prio in tqPriorityList:
                            if rndm < prio:
                                tqID = tq
                                break
                        if not tqDict.has_key(tqID):
                            tqDict[tqID] = []
                        tqDict[tqID].append(pilotID)

                    for tqID, pilotList in tqDict.items():
                        result = pilotAgentsDB.addPilotTQReference(
                            pilotList, tqID, self.pilotDN, self.pilotGroup,
                            self.localhost, ceType, '', stampDict)
                        if not result['OK']:
                            self.log.error(
                                'Failed add pilots to the PilotAgentsDB: ',
                                result['Message'])
                            continue
                        for pilot in pilotList:
                            result = pilotAgentsDB.setPilotStatus(
                                pilot, 'Submitted', ceName,
                                'Successfully submitted by the SiteDirector',
                                siteName, queueName)
                            if not result['OK']:
                                self.log.error('Failed to set pilot status: ',
                                               result['Message'])
                                continue

        self.log.info(
            "%d pilots submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
コード例 #25
0
ファイル: ProxyStorage.py プロジェクト: rob-c/DIRAC
 def getDirectorySize(self, path):
     """ Forward a 'getDirectorySize' call for the given path through the
     proxy method of the service reached at self.url

     :param path: path handed over, wrapped in a one-element list, as the
                  positional arguments of the proxied call
     :return: whatever callProxyMethod returns
     """
     proxyArgs = [path]
     proxyKwargs = {}
     return RPCClient(self.url).callProxyMethod(self.name, 'getDirectorySize',
                                                proxyArgs, proxyKwargs)
コード例 #26
0
ファイル: SiteMapClient.py プロジェクト: ptakha/DIRAC-1
 def __getRPCClient(self):
     """ Build a client for the "Framework/SiteMap" service

     The factory stored in self.getRPCClient is used when it is set
     (truthy); otherwise a plain RPCClient is created.
     """
     clientFactory = self.getRPCClient if self.getRPCClient else RPCClient
     return clientFactory("Framework/SiteMap")
コード例 #27
0
ファイル: ProxyStorage.py プロジェクト: rob-c/DIRAC
 def removeDirectory(self, path, recursive=False):
     """ Forward a 'removeDirectory' call for the given path through the
     proxy method of the service reached at self.url

     :param path: path handed over, wrapped in a one-element list, as the
                  positional arguments of the proxied call
     :param recursive: passed on as the 'recursive' keyword argument
     :return: whatever callProxyMethod returns
     """
     proxyArgs = [path]
     proxyKwargs = {'recursive': recursive}
     return RPCClient(self.url).callProxyMethod(self.name, 'removeDirectory',
                                                proxyArgs, proxyKwargs)
コード例 #28
0
ファイル: DiracAdmin.py プロジェクト: zimmerst/DIRAC
                jobID = int(jobID)
            except Exception, x:
                return self._errorReport(
                    str(x),
                    'Expected integer or convertible integer for existing jobID'
                )
        elif type(jobID) == type([]):
            try:
                jobID = [int(job) for job in jobID]
            except Exception, x:
                return self._errorReport(
                    str(x),
                    'Expected integer or convertible integer for existing jobIDs'
                )

        jobManager = RPCClient('WorkloadManagement/JobManager',
                               useCertificates=False)
        result = jobManager.resetJob(jobID)
        return result

    #############################################################################
    def getJobPilotOutput(self, jobID, directory=''):
        """Retrieve the pilot output for an existing job in the WMS.
       The output will be retrieved in a local directory unless
       otherwise specified.

       >>> print dirac.getJobPilotOutput(12345)
       {'OK': True, StdOut:'',StdError:''}

       @param job: JobID
       @type job: integer or string
       @return: S_OK,S_ERROR
コード例 #29
0
 def __getRPCClient(self):
     """ Return the RPC client to use: the stored one when it is set,
     otherwise a new RPCClient built from the stored service name and
     keyword arguments
     """
     if not self.__rpcClient:
         return RPCClient(self.__serviceName, **self.__kwargs)
     return self.__rpcClient
コード例 #30
0
 def requestManager(cls):
     """ Return the "RequestManagement/ReqManager" RPC client kept on the
     class, creating and storing it first when none is set yet
     """
     if cls.__requestManager:
         return cls.__requestManager
     cls.__requestManager = RPCClient("RequestManagement/ReqManager")
     return cls.__requestManager