Beispiel #1
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        queues = self.queueDict.keys()
        random.shuffle(queues)
        for queue in queues:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            siteMask = siteName in siteMaskList

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result['OK']:
                self.log.warn(
                    'Failed to check the availability of queue %s: \n%s' %
                    (queue, result['Message']))
                continue
            ceInfoDict = result['CEInfoDict']
            self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                           ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                             ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )

            totalSlots = result['Value']

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site not in the mask %s' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(self.platforms)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.info('No matching TQs found')
                continue

            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            pilotsToSubmit = min(totalSlots, totalTQJobs)

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {
                        'TaskQueueID': tqIDList,
                        'Status': WAITING_PILOT_STATUS
                    }, None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots',
                                   result['Message'])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result['Value']
                    self.log.verbose(
                        'Waiting Pilots for TaskQueue %s:' % tqIDList,
                        totalWaitingPilots)

            pilotsToSubmit = max(
                0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \
                                    ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                if ceType == 'CREAM':
                    jobExecDir = '.'
                jobExecDir = self.queueDict[queue].get('JobExecDir',
                                                       jobExecDir)
                httpProxy = self.queueDict[queue].get('HttpProxy', '')

                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy, httpProxy,
                                              jobExecDir)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable, '', pilotSubmissionChunk)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue,
                                   result['Message'])
                    pilotsToSubmit = 0
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.log.info('Submitted %d pilots to %s@%s' %
                              (len(pilotList), queueName, ceName))
                stampDict = {}
                if result.has_key('PilotStampDict'):
                    stampDict = result['PilotStampDict']
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if not tqDict.has_key(tqID):
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.pilotDN, self.pilotGroup,
                        self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: ',
                            result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ',
                                           result['Message'])
                            continue

        return S_OK()
Beispiel #2
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        queues = self.queueDict.keys()

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for queue in queues:
            tags += self.queueDict[queue]['ParametersDict']['Tags']
        tqDict['Tag'] = list(set(tags))

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()
        self.log.info(tqIDList)
        result = pilotAgentsDB.countPilots(
            {
                'TaskQueueID': tqIDList,
                'Status': WAITING_PILOT_STATUS
            }, None)
        tagWaitingPilots = 0
        if result['OK']:
            tagWaitingPilots = result['Value']
        self.log.info(
            'Total %d jobs in %d task queues with %d waiting pilots' %
            (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
        self.log.info('Queues: ', self.queueDict.keys())
        # if tagWaitingPilots >= totalWaitingJobs:
        #  self.log.info( 'No more pilots to be submitted in this cycle' )
        #  return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        random.shuffle(queues)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for queue in queues:

            # Check if the queue failed previously
            failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
            if failedCount != 0:
                self.log.warn("%s queue failed recently, skipping %d cycles" %
                              (queue, 10 - failedCount))
                self.failedQueues[queue] += 1
                continue

            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            platform = self.queueDict[queue]['Platform']
            queueTags = self.queueDict[queue]['ParametersDict']['Tags']
            siteMask = siteName in siteMaskList
            processorTags = []

            for tag in queueTags:
                if re.match(r'^[0-9]+Processors$', tag):
                    processorTags.append(tag)
            if 'WholeNode' in queueTags:
                processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (queueName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose(
                    "Skipping queue %s at site %s not in the mask" %
                    (queueName, siteName))
                continue

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            # if not siteMask and 'Site' in ceDict:
            #  self.log.info( 'Site not in the mask %s' % siteName )
            #  self.log.info( 'Removing "Site" from matching Dict' )
            #  del ceDict[ 'Site' ]
            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = processorTags
            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % queue)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            totalTQJobsByProcessors = {}
            tqIDList = taskQueueDict.keys()
            tqIDListByProcessors = {}
            for tq in taskQueueDict:
                if 'Tags' not in taskQueueDict[tq]:
                    # skip non multiprocessor tqs
                    continue
                for tag in taskQueueDict[tq]['Tags']:
                    if tag in processorTags:
                        tqIDListByProcessors.setdefault(tag, [])
                        tqIDListByProcessors[tag].append(tq)

                        totalTQJobsByProcessors.setdefault(tag, 0)
                        totalTQJobsByProcessors[tag] += taskQueueDict[tq][
                            'Jobs']

                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), queue))

            queueSubmittedPilots = 0
            for tag in tqIDListByProcessors.keys():

                self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" %
                                 (tag, tqIDListByProcessors[tag]))

                processors = 1

                m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
                if m:
                    processors = int(m.group('processors'))
                if tag == 'WholeNode':
                    processors = -1

                tagTQJobs = totalTQJobsByProcessors[tag]
                tagTqIDList = tqIDListByProcessors[tag]

                # Get the number of already waiting pilots for these task queues
                tagWaitingPilots = 0
                if self.pilotWaitingFlag:
                    lastUpdateTime = dateTime(
                    ) - self.pilotWaitingTime * second
                    result = pilotAgentsDB.countPilots(
                        {
                            'TaskQueueID': tagTqIDList,
                            'Status': WAITING_PILOT_STATUS
                        }, None, lastUpdateTime)
                    if not result['OK']:
                        self.log.error(
                            'Failed to get Number of Waiting pilots',
                            result['Message'])
                        tagWaitingPilots = 0
                    else:
                        tagWaitingPilots = result['Value']
                        self.log.verbose(
                            'Waiting Pilots for TaskQueue %s:' % tagTqIDList,
                            tagWaitingPilots)
                if tagWaitingPilots >= tagTQJobs:
                    self.log.verbose(
                        "%d waiting pilots already for all the available jobs"
                        % tagWaitingPilots)
                    continue

                self.log.verbose(
                    "%d waiting pilots for the total of %d eligible jobs for %s"
                    % (tagWaitingPilots, tagTQJobs, queue))

                # Get the working proxy
                cpuTime = queueCPUTime + 86400
                self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                                 (self.pilotDN, self.pilotGroup, cpuTime))
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, cpuTime)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, cpuTime - 60)

                # Get the number of available slots on the target site/queue
                totalSlots = self.getQueueSlots(queue, False)
                if totalSlots == 0:
                    self.log.debug('%s: No slots available' % queue)
                    continue

                # Note: comparing slots to job numbers is not accurate in multiprocessor case.
                #       This could lead to over submission.
                pilotsToSubmit = max(
                    0, min(totalSlots, tagTQJobs - tagWaitingPilots))
                self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                               ( queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit ) )

                # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
                pilotsToSubmit = min(
                    self.maxPilotsToSubmit - queueSubmittedPilots,
                    pilotsToSubmit)

                while pilotsToSubmit > 0:
                    self.log.info('Going to submit %d pilots to %s queue' %
                                  (pilotsToSubmit, queue))

                    bundleProxy = self.queueDict[queue].get(
                        'BundleProxy', False)
                    jobExecDir = ''
                    jobExecDir = self.queueDict[queue]['ParametersDict'].get(
                        'JobExecDir', jobExecDir)
                    httpProxy = self.queueDict[queue]['ParametersDict'].get(
                        'HttpProxy', '')

                    result = self.getExecutable(queue, pilotsToSubmit,
                                                bundleProxy, httpProxy,
                                                jobExecDir, processors)
                    if not result['OK']:
                        return result

                    executable, pilotSubmissionChunk = result['Value']
                    result = ce.submitJob(executable,
                                          '',
                                          pilotSubmissionChunk,
                                          processors=processors)
                    # ## FIXME: The condor thing only transfers the file with some
                    # ## delay, so when we unlink here the script is gone
                    # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                    if ceType != 'HTCondorCE':
                        os.unlink(executable)
                    if not result['OK']:
                        self.log.error(
                            'Failed submission to queue %s:\n' % queue,
                            result['Message'])
                        pilotsToSubmit = 0
                        self.failedQueues[queue] += 1
                        continue

                    pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                    queueSubmittedPilots += pilotSubmissionChunk
                    # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                    # task queue priorities
                    pilotList = result['Value']
                    self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                    totalSubmittedPilots += len(pilotList)
                    self.log.info('Submitted %d pilots to %s@%s' %
                                  (len(pilotList), queueName, ceName))
                    stampDict = {}
                    if result.has_key('PilotStampDict'):
                        stampDict = result['PilotStampDict']
                    tqPriorityList = []
                    sumPriority = 0.
                    for tq in tagTqIDList:
                        sumPriority += taskQueueDict[tq]['Priority']
                        tqPriorityList.append((tq, sumPriority))
                    rndm = random.random() * sumPriority
                    tqDict = {}
                    for pilotID in pilotList:
                        rndm = random.random() * sumPriority
                        for tq, prio in tqPriorityList:
                            if rndm < prio:
                                tqID = tq
                                break
                        if not tqDict.has_key(tqID):
                            tqDict[tqID] = []
                        tqDict[tqID].append(pilotID)

                    for tqID, pilotList in tqDict.items():
                        result = pilotAgentsDB.addPilotTQReference(
                            pilotList, tqID, self.pilotDN, self.pilotGroup,
                            self.localhost, ceType, '', stampDict)
                        if not result['OK']:
                            self.log.error(
                                'Failed add pilots to the PilotAgentsDB: ',
                                result['Message'])
                            continue
                        for pilot in pilotList:
                            result = pilotAgentsDB.setPilotStatus(
                                pilot, 'Submitted', ceName,
                                'Successfully submitted by the SiteDirector',
                                siteName, queueName)
                            if not result['OK']:
                                self.log.error('Failed to set pilot status: ',
                                               result['Message'])
                                continue

        self.log.info(
            "%d pilots submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Beispiel #3
0
    def _submitPilots(self, workDir, taskQueueDict, pilotOptions,
                      pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ,
                      proxy, pilotsPerJob):
        """
      This method does the actual pilot submission to the Grid RB
      The logic is as follows:
      - If there are no available RB it return error
      - If there is no VOMS extension in the proxy, return error
      - It creates a temp directory
      - Prepare a JDL
        it has some part common to gLite and LCG (the payload description)
        it has some part specific to each middleware
    """
        taskQueueID = taskQueueDict['TaskQueueID']
        # ownerDN = taskQueueDict['OwnerDN']
        credDict = proxy.getCredentials()['Value']
        ownerDN = credDict['identity']
        ownerGroup = credDict['group']

        if not self.resourceBrokers:
            # Since we can exclude RBs from the list, it may become empty
            return S_ERROR(ERROR_RB)

        # Need to get VOMS extension for the later interactions with WMS
        ret = gProxyManager.getVOMSAttributes(proxy)
        if not ret['OK']:
            self.log.error(ERROR_VOMS, ret['Message'])
            return S_ERROR(ERROR_VOMS)
        if not ret['Value']:
            return S_ERROR(ERROR_VOMS)
        vomsGroup = ret['Value'][0]

        workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID,
                                            dir=workDir)
        self.log.verbose('Using working Directory:', workingDirectory)

        # Write JDL
        retDict = self._prepareJDL(taskQueueDict, workingDirectory,
                                   pilotOptions, pilotsPerJob, ceMask,
                                   submitPrivatePilot, privateTQ)
        jdl = retDict['JDL']
        pilotRequirements = retDict['Requirements']
        rb = retDict['RB']
        if not jdl:
            try:
                shutil.rmtree(workingDirectory)
            except:
                pass
            return S_ERROR(ERROR_JDL)

        # Check that there are available queues for the Job:
        if self.enableListMatch:
            availableCEs = []
            now = Time.dateTime()
            availableCEs = self.listMatchCache.get(pilotRequirements)
            if availableCEs == False:
                availableCEs = self._listMatch(proxy, jdl, taskQueueID, rb)
                if availableCEs != False:
                    self.log.verbose('LastListMatch', now)
                    self.log.verbose('AvailableCEs ', availableCEs)
                    self.listMatchCache.add(
                        pilotRequirements,
                        self.listMatchDelay * 60,
                        value=availableCEs)  # it is given in minutes
            if not availableCEs:
                try:
                    shutil.rmtree(workingDirectory)
                except:
                    pass
                return S_ERROR(ERROR_CE + ' TQ: %d' % taskQueueID)

        # Now we are ready for the actual submission, so

        self.log.verbose('Submitting Pilots for TaskQueue', taskQueueID)
        submitRet = self._submitPilot(proxy, pilotsPerJob, jdl, taskQueueID,
                                      rb)
        try:
            shutil.rmtree(workingDirectory)
        except:
            pass
        if not submitRet:
            return S_ERROR('Pilot Submission Failed for TQ %d ' % taskQueueID)
        # pilotReference, resourceBroker = submitRet

        submittedPilots = 0

        if pilotsPerJob != 1 and len(submitRet) != pilotsPerJob:
            # Parametric jobs are used
            for pilotReference, resourceBroker in submitRet:
                pilotReference = self._getChildrenReferences(
                    proxy, pilotReference, taskQueueID)
                submittedPilots += len(pilotReference)
                pilotAgentsDB.addPilotTQReference(pilotReference, taskQueueID,
                                                  ownerDN, ownerGroup,
                                                  resourceBroker,
                                                  self.gridMiddleware,
                                                  pilotRequirements)
        else:
            for pilotReference, resourceBroker in submitRet:
                pilotReference = [pilotReference]
                submittedPilots += len(pilotReference)
                pilotAgentsDB.addPilotTQReference(pilotReference, taskQueueID,
                                                  ownerDN, ownerGroup,
                                                  resourceBroker,
                                                  self.gridMiddleware,
                                                  pilotRequirements)

        # add some sleep here
        time.sleep(0.1 * submittedPilots)

        if pilotsToSubmit > pilotsPerJob:
            # Additional submissions are necessary, need to get a new token and iterate.
            pilotsToSubmit -= pilotsPerJob
            result = gProxyManager.requestToken(
                ownerDN, ownerGroup, max(pilotsToSubmit,
                                         self.maxJobsInFillMode))
            if not result['OK']:
                self.log.error(ERROR_TOKEN, result['Message'])
                result = S_ERROR(ERROR_TOKEN)
                result['Value'] = submittedPilots
                return result
            (token, numberOfUses) = result['Value']
            for option in pilotOptions:
                if option.find('-o /Security/ProxyToken=') == 0:
                    pilotOptions.remove(option)
            pilotOptions.append('-o /Security/ProxyToken=%s' % token)
            pilotsPerJob = max(
                1, min(pilotsPerJob,
                       int(numberOfUses / self.maxJobsInFillMode)))
            result = self._submitPilots(workDir, taskQueueDict, pilotOptions,
                                        pilotsToSubmit, ceMask,
                                        submitPrivatePilot, privateTQ, proxy,
                                        pilotsPerJob)
            if not result['OK']:
                if 'Value' not in result:
                    result['Value'] = 0
                result['Value'] += submittedPilots
                return result
            submittedPilots += result['Value']

        return S_OK(submittedPilots)
Beispiel #4
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            siteMask = siteName in siteMaskList

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                return S_ERROR('CPU time limit is not specified for queue %s' %
                               queue)
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.genericPilotDN, self.genericPilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            result = ce.available()
            if not result['OK']:
                self.log.warn(
                    'Failed to check the availability of queue %s: %s' %
                    (queue, result['Message']))
                continue

            totalSlots = result['Value']

            self.log.verbose(result['Message'])

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site not in the mask %s' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']

            result = taskQueueDB.getMatchingTaskQueues(ceDict)

            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found')
                continue

            totalTQJobs = 0
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            pilotsToSubmit = min(totalSlots, totalTQJobs)
            self.log.verbose(
                'Available slots=%d, TQ jobs=%d, Pilots to submit=%d' %
                (totalSlots, totalTQJobs, pilotsToSubmit))

            if pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy)
                if not result['OK']:
                    return result

                executable = result['Value']
                result = ce.submitJob(executable, '', pilotsToSubmit)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:' % queue,
                                   result['Message'])
                    continue
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                stampDict = {}
                if result.has_key('PilotStampDict'):
                    stampDict = result['PilotStampDict']
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if not tqDict.has_key(tqID):
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.genericPilotDN,
                        self.genericPilotGroup, self.localhost, ceType, '',
                        stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: %s' %
                            result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfuly submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: %s' %
                                           result['Message'])
                            continue

        return S_OK()
Beispiel #5
0
    def createVMs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        vmTypeList = self.vmTypeDict.keys()

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {'Setup': setup, 'CPUTime': 9999999}
        if self.vo:
            tqDict['VO'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for vmType in vmTypeList:
            if 'Tag' in self.vmTypeDict[vmType]['ParametersDict']:
                tags += self.vmTypeDict[vmType]['ParametersDict']['Tag']
        tqDict['Tag'] = list(set(tags))
        tqDict['SubmitPool'] = "wenmrPool"

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()

        result = virtualMachineDB.getInstanceCounters('Status', {})
        totalVMs = 0
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted', 'Running']:
                    totalVMs += result['Value'][status]
        self.log.info('Total %d jobs in %d task queues with %d VMs' %
                      (totalWaitingJobs, len(tqIDList), totalVMs))

        # Check if the site is allowed in the mask
        result = self.wmsClient.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        vmTypeList = self.vmTypeDict.keys()
        random.shuffle(vmTypeList)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for vmType in vmTypeList:
            ce = self.vmTypeDict[vmType]['CE']
            ceName = self.vmTypeDict[vmType]['CEName']
            vmTypeName = self.vmTypeDict[vmType]['VMType']
            siteName = self.vmTypeDict[vmType]['Site']
            platform = self.vmTypeDict[vmType]['Platform']
            vmTypeTags = self.vmTypeDict[vmType]['ParametersDict'].get(
                'Tag', [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.vmTypeDict[vmType]['MaxInstances'])
            processorTags = []

            # vms support WholeNode naturally
            processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (vmTypeName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" %
                                 (vmTypeName, siteName))
                continue

            if 'CPUTime' in self.vmTypeDict[vmType]['ParametersDict']:
                vmTypeCPUTime = int(
                    self.vmTypeDict[vmType]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % vmType)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['VO'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = list(set(processorTags + vmTypeTags))

            # Get the number of eligible jobs for the target site/queue

            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % vmType)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), vmType))

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters(
                'Status', {'Endpoint': endpoint})
            if result['OK']:
                for status in result['Value']:
                    if status in ['New', 'Submitted']:
                        totalWaitingVMs += result['Value'][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" %
                                 totalWaitingVMs)

            self.log.verbose(
                "%d VMs for the total of %d eligible jobs for %s" %
                (totalWaitingVMs, totalTQJobs, vmType))

            # Get proxy to be used to connect to the cloud endpoint
            authType = ce.parameters.get('Auth')
            if authType and authType.lower() in ['x509', 'voms']:
                self.log.verbose("Getting cloud proxy for %s/%s" %
                                 (siteName, ceName))
                result = getProxyFileForCE(ce)
                if not result['OK']:
                    continue
                ce.setProxy(result['Value'])

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % vmType)
                continue

            vmsToSubmit = max(0, min(totalSlots,
                                     totalTQJobs - totalWaitingVMs))
            self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                          (vmType, totalSlots, totalTQJobs, totalWaitingVMs,
                           vmsToSubmit))

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                continue

            self.log.info('Going to submit %d VMs to %s queue' %
                          (vmsToSubmit, vmType))
            result = ce.createInstances(vmsToSubmit)

            #result = S_OK()
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % vmType,
                               result['Message'])
                self.failedVMTypes.setdefault(vmType, 0)
                self.failedVMTypes[vmType] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result['Value']
            totalSubmittedPilots += len(vmDict)
            self.log.info('Submitted %d VMs to %s@%s' %
                          (len(vmDict), vmTypeName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]['InstanceID']
                endpoint = '%s::%s' % (self.vmTypeDict[vmType]['Site'], ceName)
                result = virtualMachineDB.insertInstance(
                    uuID, vmTypeName, diracUUID, endpoint, self.vo)
                if not result['OK']:
                    continue
                for ncpu in range(vmDict[uuID]['NumberOfProcessors']):
                    pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(
                        ncpu).zfill(2)
                    pilotList.append(pRef)

            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if tqID not in tqDict:
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(
                    pilotList, tqID, '', '', self.localhost, 'Cloud',
                    stampDict)
                if not result['OK']:
                    self.log.error(
                        'Failed to insert pilots into the PilotAgentsDB: %s' %
                        result['Message'])

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Beispiel #6
0
    def createVMs(self):
        """Go through defined computing elements and submit jobs if necessary"""

        vmTypeList = list(self.vmTypeDict.keys())

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999}
        if self.vo:
            tqDict["VO"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites
        tags = []
        for vmType in vmTypeList:
            if "Tag" in self.vmTypeDict[vmType]["ParametersDict"]:
                tags += self.vmTypeDict[vmType]["ParametersDict"]["Tag"]
        tqDict["Tag"] = list(set(tags))

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        matcherClient = MatcherClient()
        result = matcherClient.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        if not result["Value"]:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result["Value"]:
            if "Sites" in result["Value"][tqID]:
                for site in result["Value"][tqID]["Sites"]:
                    if site.lower() != "any":
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result["Value"][tqID]:
                if "Sites" in result["Value"][tqID]:
                    for site in result["Value"][tqID]["Sites"]:
                        if site.lower() != "any":
                            testSites.add(site)
            totalWaitingJobs += result["Value"][tqID]["Jobs"]

        tqIDList = list(result["Value"].keys())

        result = virtualMachineDB.getInstanceCounters("Status", {})
        totalVMs = 0
        if result["OK"]:
            for status in result["Value"]:
                if status in ["New", "Submitted", "Running"]:
                    totalVMs += result["Value"][status]
        self.log.info("Total %d jobs in %d task queues with %d VMs" % (totalWaitingJobs, len(tqIDList), totalVMs))

        # Check if the site is allowed in the mask
        result = self.siteClient.getUsableSites()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result.get("Value", [])

        vmTypeList = list(self.vmTypeDict.keys())
        random.shuffle(vmTypeList)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for vmType in vmTypeList:
            ce = self.vmTypeDict[vmType]["CE"]
            ceName = self.vmTypeDict[vmType]["CEName"]
            vmTypeName = self.vmTypeDict[vmType]["VMType"]
            siteName = self.vmTypeDict[vmType]["Site"]
            platform = self.vmTypeDict[vmType]["Platform"]
            vmTypeTags = self.vmTypeDict[vmType]["ParametersDict"].get("Tag", [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.vmTypeDict[vmType]["MaxInstances"])
            processorTags = []

            # vms support WholeNode naturally
            processorTags.append("WholeNode")

            if not anySite and siteName not in jobSites:
                self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName))
                continue

            if "CPUTime" in self.vmTypeDict[vmType]["ParametersDict"]:
                vmTypeCPUTime = int(self.vmTypeDict[vmType]["ParametersDict"]["CPUTime"])
            else:
                self.log.warn("CPU time limit is not specified for queue %s, skipping..." % vmType)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict["JobType"] = "Test"
            if self.vo:
                ceDict["VO"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            ceDict["Tag"] = list(set(processorTags + vmTypeTags))

            # Get the number of eligible jobs for the target site/queue

            result = matcherClient.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.verbose("No matching TQs found for %s" % vmType)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            tqIDList = list(taskQueueDict.keys())
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            self.log.verbose(
                "%d job(s) from %d task queue(s) are eligible for %s queue" % (totalTQJobs, len(tqIDList), vmType)
            )

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
            if result["OK"]:
                for status in result["Value"]:
                    if status in ["New", "Submitted"]:
                        totalWaitingVMs += result["Value"][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

            self.log.verbose("%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, vmType))

            # Get proxy to be used to connect to the cloud endpoint
            authType = ce.parameters.get("Auth")
            if authType and authType.lower() in ["x509", "voms"]:
                self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
                result = getProxyFileForCloud(ce)
                if not result["OK"]:
                    continue
                ce.setProxy(result["Value"])

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug("%s: No slots available" % vmType)
                continue

            vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
            self.log.info(
                "%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d"
                % (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit)
            )

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                continue

            self.log.info("Going to submit %d VMs to %s queue" % (vmsToSubmit, vmType))
            result = ce.createInstances(vmsToSubmit)

            # result = S_OK()
            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % vmType, result["Message"])
                self.failedVMTypes.setdefault(vmType, 0)
                self.failedVMTypes[vmType] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result["Value"]
            totalSubmittedPilots += len(vmDict)
            self.log.info("Submitted %d VMs to %s@%s" % (len(vmDict), vmTypeName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]["InstanceID"]
                endpoint = "%s::%s" % (self.vmTypeDict[vmType]["Site"], ceName)
                result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo)
                if not result["OK"]:
                    continue
                pRef = "vm://" + ceName + "/" + diracUUID + ":00"
                pilotList.append(pRef)

            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.0
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]["Priority"]
                tqPriorityList.append((tq, sumPriority))
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if tqID not in tqDict:
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, "", "", self.localhost, "Cloud", stampDict)
                if not result["OK"]:
                    self.log.error("Failed to insert pilots into the PilotAgentsDB: %s" % result["Message"])

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)
        )
        return S_OK()
Beispiel #7
0
    def createVMs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {'Setup': setup, 'CPUTime': 9999999}
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tqDict['Tag'] = []
        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()

        result = virtualMachineDB.getInstanceCounters('Status', {})
        totalVMs = 0
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted', 'Running']:
                    totalVMs += result['Value'][status]
        self.log.info('Total %d jobs in %d task queues with %d VMs' %
                      (totalWaitingJobs, len(tqIDList), totalVMs))

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        images = self.imageDict.keys()
        random.shuffle(images)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for image in images:

            # Check if the image failed previously
            #failedCount = self.failedImages[ image ] % self.failedImageCycleFactor
            #if failedCount != 0:
            #  self.log.warn( "%s queue failed recently, skipping %d cycles" % ( image, 10-failedCount ) )
            #  self.failedImages[image] += 1
            #  continue

            print "AT >>> image parameters:", image
            for key, value in self.imageDict[image].items():
                print key, value

            ce = self.imageDict[image]['CE']
            ceName = self.imageDict[image]['CEName']
            imageName = self.imageDict[image]['ImageName']
            siteName = self.imageDict[image]['Site']
            platform = self.imageDict[image]['Platform']
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.imageDict[image]['MaxInstances'])

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (imageName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" %
                                 (imageName, siteName))
                continue

            if 'CPUTime' in self.imageDict[image]['ParametersDict']:
                imageCPUTime = int(
                    self.imageDict[image]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % image)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['VO'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue

            print "AT >>> getMatchingTaskQueues ceDict", ceDict

            result = rpcMatcher.getMatchingTaskQueues(ceDict)

            print result

            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % image)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), image))

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters(
                'Status', {'Endpoint': endpoint})
            if result['OK']:
                for status in result['Value']:
                    if status in ['New', 'Submitted']:
                        totalWaitingVMs += result['Value'][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" %
                                 totalWaitingVMs)

            self.log.verbose(
                "%d VMs for the total of %d eligible jobs for %s" %
                (totalWaitingVMs, totalTQJobs, image))

            # Get the working proxy
            #cpuTime = imageCPUTime + 86400
            #self.log.verbose( "Getting cloud proxy for %s/%s %d long" % ( self.cloudDN, self.cloudGroup, cpuTime ) )
            #result = gProxyManager.getPilotProxyFromDIRACGroup( self.cloudDN, self.cloudGroup, cpuTime )
            #if not result['OK']:
            #  return result
            #self.proxy = result['Value']
            #ce.setProxy( self.proxy, cpuTime - 60 )

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % image)
                continue

            vmsToSubmit = max(0, min(totalSlots,
                                     totalTQJobs - totalWaitingVMs))
            self.log.info( '%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' % \
                                    ( image, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit ) )

            # Limit the number of clouds to submit to MAX_PILOTS_TO_SUBMIT
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)

            self.log.info('Going to submit %d VMs to %s queue' %
                          (vmsToSubmit, image))
            result = ce.createInstances(vmsToSubmit)

            print "AT >>> createInstances", result, image

            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % image,
                               result['Message'])
                self.failedImages.setdefault(image, 0)
                self.failedImages[image] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result['Value']
            totalSubmittedPilots += len(vmDict)
            self.log.info('Submitted %d VMs to %s@%s' %
                          (len(vmDict), imageName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]['InstanceID']
                endpoint = '%s::%s' % (self.imageDict[image]['Site'], ceName)
                result = virtualMachineDB.insertInstance(
                    uuID, imageName, diracUUID, endpoint, self.vo)
                if not result['OK']:
                    continue
                for ncpu in range(vmDict[uuID]['NumberOfCPUs']):
                    pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(
                        ncpu).zfill(2)
                    pilotList.append(pRef)

            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if not tqDict.has_key(tqID):
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(
                    pilotList, tqID, '', '', self.localhost, 'Cloud', '',
                    stampDict)
                if not result['OK']:
                    self.log.error(
                        'Failed to insert pilots into the PilotAgentsDB')

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()