Ejemplo n.º 1
0
 def submitInstance( self, imageName, workDir ):
   """
   Record a new instance of imageName through virtualMachineDB, submit it
   with self._submitInstance and mark it as submitted.

   :param imageName: image to instantiate; must be present in self.images
   :param workDir: forwarded unchanged to self._submitInstance
   :return: DIRAC.S_OK( imageName ) when every step succeeds; DIRAC.S_ERROR
            for an unknown image; otherwise the result dictionary of the
            first step that failed
   """
   self.log.info( 'Submitting', imageName )

   isKnownImage = imageName in self.images
   if not isKnownImage:
     return DIRAC.S_ERROR( 'Unknown Image: %s' % imageName )

   # The DB record is created before the actual submission
   insertion = virtualMachineDB.insertInstance( imageName, imageName )
   if not insertion['OK']:
     return insertion
   instanceID = insertion['Value']

   submission = self._submitInstance( imageName, workDir )
   if not submission['OK']:
     return submission
   uniqueID = submission['Value']

   # Link the DB record to the ID returned by the submission
   linking = virtualMachineDB.setInstanceUniqueID( instanceID, uniqueID )
   if not linking['OK']:
     return linking

   declaration = virtualMachineDB.declareInstanceSubmitted( uniqueID )
   if not declaration['OK']:
     return declaration

   return DIRAC.S_OK( imageName )
Ejemplo n.º 2
0
    def createVMs(self):
        """ Go through the configured VM types and create VM instances where
        matching workload is waiting.

        The Matcher service is first asked whether any task queue matches the
        overall requirements of this director. Then, for each VM type (visited
        in random order), the method counts the eligible jobs, the VMs already
        waiting at the endpoint and the free slots, asks the computing element
        to create the missing instances, registers them in the
        VirtualMachineDB and stores one pilot reference per processor in the
        PilotAgentsDB.

        :return: S_OK() when the cycle completes (also when there is no
                 waiting workload); the failed result dictionary or S_ERROR
                 when a required service call fails
        """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {'Setup': setup, 'CPUTime': 9999999}
        if self.vo:
            tqDict['VO'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for vmType in self.vmTypeDict:
            if 'Tag' in self.vmTypeDict[vmType]['ParametersDict']:
                tags += self.vmTypeDict[vmType]['ParametersDict']['Tag']
        tqDict['Tag'] = list(set(tags))
        # NOTE(review): the submit pool is hard-coded here - confirm this is
        # intended for every deployment.
        tqDict['SubmitPool'] = "wenmrPool"

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # Collect the sites requested by the waiting jobs. testSites holds
        # the sites of the task queues that carry a "JobTypes" entry.
        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        totalTaskQueues = len(result['Value'])

        result = virtualMachineDB.getInstanceCounters('Status', {})
        totalVMs = 0
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted', 'Running']:
                    totalVMs += result['Value'][status]
        self.log.info('Total %d jobs in %d task queues with %d VMs' %
                      (totalWaitingJobs, totalTaskQueues, totalVMs))

        # Check if the site is allowed in the mask
        result = self.wmsClient.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        # random.shuffle needs a mutable sequence, hence the explicit list
        vmTypeList = list(self.vmTypeDict)
        random.shuffle(vmTypeList)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for vmType in vmTypeList:
            ce = self.vmTypeDict[vmType]['CE']
            ceName = self.vmTypeDict[vmType]['CEName']
            vmTypeName = self.vmTypeDict[vmType]['VMType']
            siteName = self.vmTypeDict[vmType]['Site']
            platform = self.vmTypeDict[vmType]['Platform']
            vmTypeTags = self.vmTypeDict[vmType]['ParametersDict'].get(
                'Tag', [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.vmTypeDict[vmType]['MaxInstances'])

            # vms support WholeNode naturally
            processorTags = ['WholeNode']

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (vmTypeName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" %
                                 (vmTypeName, siteName))
                continue

            if 'CPUTime' not in self.vmTypeDict[vmType]['ParametersDict']:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % vmType)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['VO'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                self.log.error(
                    'Could not get compatible platforms for %s' % vmType,
                    result['Message'])
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = list(set(processorTags + vmTypeTags))

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % vmType)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(taskQueueDict), vmType))

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters(
                'Status', {'Endpoint': endpoint})
            if result['OK']:
                for status in result['Value']:
                    if status in ['New', 'Submitted']:
                        totalWaitingVMs += result['Value'][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" %
                                 totalWaitingVMs)

            self.log.verbose(
                "%d VMs for the total of %d eligible jobs for %s" %
                (totalWaitingVMs, totalTQJobs, vmType))

            # Get proxy to be used to connect to the cloud endpoint
            authType = ce.parameters.get('Auth')
            if authType and authType.lower() in ['x509', 'voms']:
                self.log.verbose("Getting cloud proxy for %s/%s" %
                                 (siteName, ceName))
                result = getProxyFileForCE(ce)
                if not result['OK']:
                    self.log.error(
                        'Could not get cloud proxy for %s/%s' %
                        (siteName, ceName), result['Message'])
                    continue
                ce.setProxy(result['Value'])

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % vmType)
                continue

            vmsToSubmit = max(0, min(totalSlots,
                                     totalTQJobs - totalWaitingVMs))
            self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                          (vmType, totalSlots, totalTQJobs, totalWaitingVMs,
                           vmsToSubmit))

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                continue

            self.log.info('Going to submit %d VMs to %s queue' %
                          (vmsToSubmit, vmType))
            result = ce.createInstances(vmsToSubmit)

            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % vmType,
                               result['Message'])
                self.failedVMTypes.setdefault(vmType, 0)
                self.failedVMTypes[vmType] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result['Value']
            totalSubmittedPilots += len(vmDict)
            self.log.info('Submitted %d VMs to %s@%s' %
                          (len(vmDict), vmTypeName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]['InstanceID']
                result = virtualMachineDB.insertInstance(
                    uuID, vmTypeName, diracUUID, endpoint, self.vo)
                if not result['OK']:
                    # The VM was created but could not be registered: report
                    # it instead of skipping silently.
                    self.log.error(
                        'Failed to insert instance %s into the '
                        'VirtualMachineDB' % uuID, result['Message'])
                    continue
                # One pilot reference per processor of the VM
                for ncpu in range(vmDict[uuID]['NumberOfProcessors']):
                    pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(
                        ncpu).zfill(2)
                    pilotList.append(pRef)

            # Distribute the pilot references over the matching task queues
            # with a probability proportional to the queue priority, using
            # the cumulative priority sums.
            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last queue when no cumulative priority
                # exceeds the draw (e.g. all priorities are 0); otherwise a
                # stale tqID from an earlier loop would be used.
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsByTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference(
                    tqPilots, tqID, '', '', self.localhost, 'Cloud',
                    stampDict)
                if not result['OK']:
                    self.log.error(
                        'Failed to insert pilots into the PilotAgentsDB: %s' %
                        result['Message'])

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Ejemplo n.º 3
0
  def submitInstance( self, imageName, endpoint, numVMsToSubmit, runningPodName ):
    """
    Submit numVMsToSubmit VMs of image imageName to endpoint on behalf of the
    running pod runningPodName.

    For every VM a record is first inserted through virtualMachineDB, then
    the VM is started with self._submitInstance and the record is updated
    with the unique ID (and the public IP for the nova-1.1 / rocci-1.1
    drivers). The record is finally declared as waiting for ssh
    contextualization or as submitted, depending on the contextMethod
    configured for the image.

    :param imageName: name of the image to instantiate
    :param endpoint: name of the cloud endpoint to submit to
    :param numVMsToSubmit: number of VMs to submit
    :param runningPodName: key of self.runningPods whose 'Requirements' are
                           passed to self._submitInstance
    :return: S_OK( imageName ) when all VMs were submitted; S_ERROR for an
             unknown running pod or missing CPUTime / SubmitPool
             requirements; otherwise the result of the first failed step
    """
    # warning: instanceID is the DIRAC instance id, while uniqueID is unique for a particular endpoint
    self.log.info( '*** Preparing to submitting VM of image: ', imageName )
    self.log.info( '******* num of VMs to sumbit: ', numVMsToSubmit )
    self.log.info( '******* of running pod: ', runningPodName )
    self.log.info( '******* destination: ', endpoint )
    if runningPodName not in self.runningPods:
      return S_ERROR( 'Unknown Running Pod: %s' % runningPodName )

    # Validate the requirements BEFORE touching the DB, so that a
    # misconfigured running pod does not leave an orphan instance record.
    # .get() turns a missing key into the intended S_ERROR instead of a
    # KeyError.
    runningRequirementsDict = self.runningPods[runningPodName]['Requirements']
    if not runningRequirementsDict.get( 'CPUTime' ):
      return S_ERROR( 'Unknown CPUTime in Requirements of the RunningPod %s' % runningPodName )
    if not runningRequirementsDict.get( 'SubmitPool' ):
      return S_ERROR( 'Unknown submitPool in Requirements of the RunningPod %s' % runningPodName )

    # These configuration values do not depend on the VM being submitted
    driver = gConfig.getValue( "/Resources/VirtualMachines/CloudEndpoints/%s/%s" % ( endpoint, "cloudDriver" ) )
    contextMethod = gConfig.getValue( "/Resources/VirtualMachines/Images/%s/%s" % ( imageName, "contextMethod" ) )
    # Per the original author's note, the CloudStack drivers produce two VMs
    # for a single creation request, so two records are declared per request.
    isCloudStack = driver == "CloudStack"

    for numVM in range( 1, numVMsToSubmit + 1 ):
      self.log.info( '********** Preparing to submitting VM number %s of %s VMs' % ( numVM, numVMsToSubmit ) )

      # FIRST, insert the instance into the DB !
      newInstance = virtualMachineDB.insertInstance( imageName, imageName, endpoint, runningPodName )
      if not newInstance[ 'OK' ]:
        return newInstance
      instanceID = newInstance[ 'Value' ]

      dictVMSubmitted = self._submitInstance( imageName, endpoint, instanceID, runningRequirementsDict )
      if not dictVMSubmitted[ 'OK' ]:
        return dictVMSubmitted

      if isCloudStack:
        # Record for the second VM created by the CloudStack driver.
        # NOTE(review): the result is not checked, and the code below assumes
        # this record got instanceID + 1 - confirm.
        virtualMachineDB.insertInstance( imageName, imageName, endpoint, runningPodName )

      if driver in ( "nova-1.1", "rocci-1.1" ):
        # For these drivers the submission value is unpacked as a
        # ( uniqueID, publicIP ) pair
        ( uniqueID, publicIP ) = dictVMSubmitted['Value']
        dictVMDBrecord = virtualMachineDB.setPublicIP( instanceID, publicIP )
        if not dictVMDBrecord['OK']:
          return dictVMDBrecord
      else:
        uniqueID = dictVMSubmitted['Value']

      dictVMDBrecord = virtualMachineDB.setInstanceUniqueID( instanceID, uniqueID )
      if not dictVMDBrecord['OK']:
        return dictVMDBrecord

      if isCloudStack:
        # NOTE(review): assumes the second VM has uniqueID - 1 - confirm.
        virtualMachineDB.setInstanceUniqueID( str( int( instanceID ) + 1 ), str( int( uniqueID ) - 1 ) )

      # check contextMethod and update status if need ssh contextualization:
      if contextMethod == 'ssh':
        dictVMDBrecord = virtualMachineDB.declareInstanceWait_ssh_context( uniqueID )
      else:
        dictVMDBrecord = virtualMachineDB.declareInstanceSubmitted( uniqueID )
      if not dictVMDBrecord['OK']:
        return dictVMDBrecord

      if isCloudStack:
        # Best effort for the second CloudStack VM: the result is ignored
        virtualMachineDB.declareInstanceSubmitted( str( int( uniqueID ) - 1 ) )

    return S_OK( imageName )
Ejemplo n.º 4
0
    def createVMs(self):
        """ Go through the configured images and create VM instances where
        matching workload is waiting.

        The Matcher service is first asked whether any task queue matches the
        overall requirements of this director. Then, for each image (visited
        in random order), the method counts the eligible jobs, the VMs already
        waiting at the endpoint and the free slots, asks the computing element
        to create the missing instances, registers them in the
        VirtualMachineDB and stores one pilot reference per CPU in the
        PilotAgentsDB.

        :return: S_OK() when the cycle completes (also when there is no
                 waiting workload); the failed result dictionary or S_ERROR
                 when a required service call fails
        """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {'Setup': setup, 'CPUTime': 9999999}
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tqDict['Tag'] = []
        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # Collect the sites requested by the waiting jobs. testSites holds
        # the sites of the task queues that carry a "JobTypes" entry.
        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        totalTaskQueues = len(result['Value'])

        result = virtualMachineDB.getInstanceCounters('Status', {})
        totalVMs = 0
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted', 'Running']:
                    totalVMs += result['Value'][status]
        self.log.info('Total %d jobs in %d task queues with %d VMs' %
                      (totalWaitingJobs, totalTaskQueues, totalVMs))

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        # random.shuffle needs a mutable sequence, hence the explicit list
        images = list(self.imageDict)
        random.shuffle(images)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for image in images:

            # Check if the image failed previously
            #failedCount = self.failedImages[ image ] % self.failedImageCycleFactor
            #if failedCount != 0:
            #  self.log.warn( "%s queue failed recently, skipping %d cycles" % ( image, 10-failedCount ) )
            #  self.failedImages[image] += 1
            #  continue

            # Debug output goes through the logger rather than stdout
            self.log.debug('Image parameters for %s:' % image)
            for key, value in self.imageDict[image].items():
                self.log.debug('%s %s' % (key, value))

            ce = self.imageDict[image]['CE']
            ceName = self.imageDict[image]['CEName']
            imageName = self.imageDict[image]['ImageName']
            siteName = self.imageDict[image]['Site']
            platform = self.imageDict[image]['Platform']
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.imageDict[image]['MaxInstances'])

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (imageName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" %
                                 (imageName, siteName))
                continue

            if 'CPUTime' not in self.imageDict[image]['ParametersDict']:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % image)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['VO'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                self.log.error(
                    'Could not get compatible platforms for %s' % image,
                    result['Message'])
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue
            self.log.debug('getMatchingTaskQueues ceDict: %s' % (ceDict,))

            result = rpcMatcher.getMatchingTaskQueues(ceDict)

            self.log.debug('getMatchingTaskQueues result: %s' % (result,))

            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % image)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(taskQueueDict), image))

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters(
                'Status', {'Endpoint': endpoint})
            if result['OK']:
                for status in result['Value']:
                    if status in ['New', 'Submitted']:
                        totalWaitingVMs += result['Value'][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" %
                                 totalWaitingVMs)

            self.log.verbose(
                "%d VMs for the total of %d eligible jobs for %s" %
                (totalWaitingVMs, totalTQJobs, image))

            # Get the working proxy
            #cpuTime = imageCPUTime + 86400
            #self.log.verbose( "Getting cloud proxy for %s/%s %d long" % ( self.cloudDN, self.cloudGroup, cpuTime ) )
            #result = gProxyManager.getPilotProxyFromDIRACGroup( self.cloudDN, self.cloudGroup, cpuTime )
            #if not result['OK']:
            #  return result
            #self.proxy = result['Value']
            #ce.setProxy( self.proxy, cpuTime - 60 )

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % image)
                continue

            vmsToSubmit = max(0, min(totalSlots,
                                     totalTQJobs - totalWaitingVMs))
            self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                          (image, totalSlots, totalTQJobs, totalWaitingVMs,
                           vmsToSubmit))

            # Limit the number of VM instances to create to maxVMsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                # Nothing to create: do not call the computing element
                continue

            self.log.info('Going to submit %d VMs to %s queue' %
                          (vmsToSubmit, image))
            result = ce.createInstances(vmsToSubmit)

            self.log.debug('createInstances result for %s: %s' %
                           (image, result))

            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % image,
                               result['Message'])
                self.failedImages.setdefault(image, 0)
                self.failedImages[image] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result['Value']
            totalSubmittedPilots += len(vmDict)
            self.log.info('Submitted %d VMs to %s@%s' %
                          (len(vmDict), imageName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]['InstanceID']
                result = virtualMachineDB.insertInstance(
                    uuID, imageName, diracUUID, endpoint, self.vo)
                if not result['OK']:
                    # The VM was created but could not be registered: report
                    # it instead of skipping silently.
                    self.log.error(
                        'Failed to insert instance %s into the '
                        'VirtualMachineDB' % uuID, result['Message'])
                    continue
                # One pilot reference per CPU of the VM
                for ncpu in range(vmDict[uuID]['NumberOfCPUs']):
                    pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(
                        ncpu).zfill(2)
                    pilotList.append(pRef)

            # Distribute the pilot references over the matching task queues
            # with a probability proportional to the queue priority, using
            # the cumulative priority sums.
            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last queue when no cumulative priority
                # exceeds the draw (e.g. all priorities are 0); otherwise a
                # stale tqID from an earlier loop would be used.
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsByTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference(
                    tqPilots, tqID, '', '', self.localhost, 'Cloud', '',
                    stampDict)
                if not result['OK']:
                    self.log.error(
                        'Failed to insert pilots into the PilotAgentsDB: %s' %
                        result['Message'])

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()