def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Count Pending Jobs
      3.- Submit VMs
    """

    self.__checkSubmitPools()

    imagesToSubmit = {}

    for directorName, directorDict in self.directors.items():
      self.log.verbose( 'Checking Director:', directorName )
      for imageName in directorDict['director'].images:
        imageDict = directorDict['director'].images[imageName]
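        # Count instances of this image that are already Running or Submitted, to enforce MaxInstances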
        instances = 0
        result = virtualMachineDB.getInstancesByStatus( 'Running' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        result = virtualMachineDB.getInstancesByStatus( 'Submitted' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        self.log.verbose( 'Checking Image %s:' % imageName, instances )
        maxInstances = imageDict['MaxInstances']
        if instances >= maxInstances:
          self.log.info( '%s >= %s Running instances of %s, skipping' % ( instances, maxInstances, imageName ) )
          continue

        imageRequirementsDict = imageDict['RequirementsDict']
        result = taskQueueDB.getMatchingTaskQueues( imageRequirementsDict )
        if not result['OK']:
          self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
          return result
        taskQueueDict = result['Value']
        jobs = 0
        priority = 0
        cpu = 0
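        # Aggregate waiting jobs, priorities and total requested CPU time over all matching TaskQueues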
        for tq in taskQueueDict:
          jobs += taskQueueDict[tq]['Jobs']
          priority += taskQueueDict[tq]['Priority']
          cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq]['CPUTime']

        if not jobs:
          self.log.info( 'No matching jobs for %s found, skipping' % imageName )
          continue

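        # Skip submission when the waiting CPU per already-running instance falls below CPUPerInstance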
        if instances and ( cpu / instances ) < imageDict['CPUPerInstance']:
          self.log.info( 'Waiting CPU per Running instance %s < %s, skipping' % ( cpu / instances, imageDict['CPUPerInstance'] ) )
          continue

        if directorName not in imagesToSubmit:
          imagesToSubmit[directorName] = {}
        if imageName not in imagesToSubmit[directorName]:
          imagesToSubmit[directorName][imageName] = {}
        imagesToSubmit[directorName][imageName] = { 'Jobs': jobs,
                                                    'TQPriority': priority,
                                                    'CPUTime': cpu,
                                                    'VMPriority': imageDict['Priority'] }

    for directorName, imageDict in imagesToSubmit.items():
      for imageName, jobsDict in imageDict.items():
        if self.directors[directorName]['isEnabled']:
          self.log.info( 'Requesting submission of %s to %s' % ( imageName, directorName ) )

          director = self.directors[directorName]['director']
          pool = self.pools[self.directors[directorName]['pool']]

          ret = pool.generateJobAndQueueIt( director.submitInstance,
                                            args=( imageName, self.workDir ),
                                            oCallback=self.callBack,
                                            oExceptionCallback=director.exceptionCallBack,
                                            blocking=False )

          if not ret['OK']:
            # Disable submission until next iteration
            self.directors[directorName]['isEnabled'] = False
          else:
            time.sleep( self.am_getOption( 'ThreadStartDelay' ) )

    if 'Default' in self.pools:
      # only for those in the 'Default' thread pool
      # for pool in self.pools:
      self.pools['Default'].processAllResults()

    return DIRAC.S_OK()
Example #2
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary
    """

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
      return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = siteName in siteMaskList

      if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        return S_ERROR( 'CPU time limit is not specified for queue %s' % queue )

      # Get the working proxy
      cpuTime = queueCPUTime + 86400
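      # Request a proxy valid for the queue CPU time plus one day (86400 s) of margin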
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.genericPilotDN, self.genericPilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: %s' % ( queue, result['Message'] ) )
        continue

      totalSlots = result['Value']

      self.log.verbose( result['Message'] )

      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site %s not in the mask' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]

      result = taskQueueDB.getMatchingTaskQueues( ceDict )

      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

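      # Submit at most as many pilots as there are free slots and waiting TaskQueue jobs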
      pilotsToSubmit = min( totalSlots, totalTQJobs )
      self.log.verbose( 'Available slots=%d, TQ jobs=%d, Pilots to submit=%d' % ( totalSlots, totalTQJobs, pilotsToSubmit ) )

      if pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy )
        if not result['OK']:
          return result

        # If proxy is not bundled in, submit with the user proxy 

        executable = result['Executable']
        proxy = result['Proxy']
        result = ce.submitJob( executable, proxy, pilotsToSubmit )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:' % queue, result['Message'] )
          continue

        # Add pilots to the PilotAgentsDB and assign them to TaskQueues proportionally to the
        # task queue priorities
        pilotList = result['Value']
        stampDict = {}
        if 'PilotStampDict' in result:
          stampDict = result['PilotStampDict']
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        rndm = random.random()*sumPriority
        tqDict = {}
        for pilotID in pilotList:
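          # Draw a random point in [0, sumPriority) and pick the first TaskQueue whose cumulative priority exceeds it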
          rndm = random.random()*sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if tqID not in tqDict:
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                     tqID,
                                                     self.genericPilotDN,
                                                     self.genericPilotGroup,
                                                     self.localhost,
                                                     ceType,
                                                     '',
                                                     stampDict )
          if not result['OK']:
            self.log.error( 'Failed to add pilots to the PilotAgentsDB: %s' % result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: %s' % result['Message'] )
              continue

    return S_OK()
Example #3
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            siteMask = siteName in siteMaskList

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                return S_ERROR('CPU time limit is not specified for queue %s' %
                               queue)
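            # Cap the requested CPU time at the configured maximum queue length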
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.genericPilotDN, self.genericPilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            result = ce.available()
            if not result['OK']:
                self.log.warn(
                    'Failed to check the availability of queue %s: %s' %
                    (queue, result['Message']))
                continue

            totalSlots = result['Value']

            self.log.verbose(result['Message'])

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site %s not in the mask' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']

            result = taskQueueDB.getMatchingTaskQueues(ceDict)

            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found')
                continue

            totalTQJobs = 0
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            pilotsToSubmit = min(totalSlots, totalTQJobs)
            self.log.verbose(
                'Available slots=%d, TQ jobs=%d, Pilots to submit=%d' %
                (totalSlots, totalTQJobs, pilotsToSubmit))

            if pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy)
                if not result['OK']:
                    return result

                executable = result['Value']
                result = ce.submitJob(executable, '', pilotsToSubmit)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:' % queue,
                                   result['Message'])
                    continue
                # Add pilots to the PilotAgentsDB and assign them to TaskQueues
                # proportionally to the task queue priorities
                pilotList = result['Value']
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if tqID not in tqDict:
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.genericPilotDN,
                        self.genericPilotGroup, self.localhost, ceType, '',
                        stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed to add pilots to the PilotAgentsDB: %s' %
                            result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: %s' %
                                           result['Message'])
                            continue

        return S_OK()
  def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Add their Priorities
      3.- Submit pilots
    """

    self.__checkSubmitPools()

    self.directorDict = getResourceDict()

    result = taskQueueDB.getMatchingTaskQueues( self.directorDict )
    if not result['OK']:
      self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
      return result
    taskQueueDict = result['Value']

    self.log.info( 'Found %s TaskQueues' % len( taskQueueDict ) )

    if not taskQueueDict:
      self.log.info( 'No TaskQueue to Process' )
      return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID in taskQueueDict:
      taskQueueDict[taskQueueID]['TaskQueueID'] = taskQueueID
      prioritySum += taskQueueDict[taskQueueID]['Priority']
      waitingJobs += taskQueueDict[taskQueueID]['Jobs']

    self.log.info( 'Sum of Priorities %s' % prioritySum )

    if waitingJobs == 0:
      self.log.info( 'No waiting Jobs' )
      return S_OK( 'No waiting Jobs' )
    if prioritySum <= 0:
      return S_ERROR( 'Wrong TaskQueue Priorities' )

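    # Pilots allowed per iteration, expressed per unit of TaskQueue priority and per waiting job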
    self.pilotsPerPriority = self.am_getOption( 'pilotsPerIteration' ) / prioritySum
    self.pilotsPerJob = self.am_getOption( 'pilotsPerIteration' ) / waitingJobs

    self.callBackLock.acquire()
    self.submittedPilots = 0
    self.callBackLock.release()
    self.toSubmitPilots = 0
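    # Pilots in these states and no older than maxPilotWaitingHours still count as waiting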
    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    timeLimitToConsider = Time.toString( Time.dateTime() - Time.hour * self.am_getOption( "maxPilotWaitingHours" ) )

    for taskQueueID in taskQueueDict:
      self.log.verbose( 'Processing TaskQueue', taskQueueID )

      result = pilotAgentsDB.countPilots( { 'TaskQueueID': taskQueueID,
                                            'Status': waitingStatusList},
                                          None, timeLimitToConsider )
      if not result['OK']:
        self.log.error( 'Failed to get the number of waiting pilots', result['Message'] )
        waitingPilots = 0
      else:
        waitingPilots = result['Value']
        self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots )

      result = self.submitPilotsForTaskQueue( taskQueueDict[taskQueueID], waitingPilots )

      if result['OK']:
        self.toSubmitPilots += result['Value']

    self.log.info( 'Number of pilots to be Submitted %s' % self.toSubmitPilots )

    # Now wait until all Jobs in the Default ThreadPool are processed
    if 'Default' in self.pools:
      # only for those in the 'Default' thread pool
      # for pool in self.pools:
      self.pools['Default'].processAllResults()

    self.log.info( 'Number of pilots Submitted %s' % self.submittedPilots )

    return S_OK()
  def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Count Pending Jobs
      3.- Submit Jobs
    """
    self.__checkSubmitPools()

    bigDataJobsToSubmit = {}
    bigDataJobIdsToSubmit = {}

    for directorName, directorDict in self.directors.items():
      self.log.verbose( 'Checking Director:', directorName )
      self.log.verbose( 'RunningEndPoints:', directorDict['director'].runningEndPoints )
      for runningEndPointName in directorDict['director'].runningEndPoints:
        runningEndPointDict = directorDict['director'].runningEndPoints[runningEndPointName]
        NameNode = runningEndPointDict['NameNode']
        jobsByEndPoint = 0
        result = BigDataDB.getBigDataJobsByStatusAndEndpoint( 'Submitted', NameNode )
        if result['OK']:
          jobsByEndPoint += len( result['Value'] )
        result = BigDataDB.getBigDataJobsByStatusAndEndpoint( 'Running', NameNode )
        if result['OK']:
          jobsByEndPoint += len( result['Value'] )
        self.log.verbose( 'Jobs already Submitted or Running at endpoint %s:' % NameNode, jobsByEndPoint )
        jobLimitsEndPoint = runningEndPointDict['LimitQueueJobsEndPoint']

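        # Remaining job slots on this endpoint: the endpoint limit minus jobs already Submitted or Running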
        bigDataJobs = 0
        if jobsByEndPoint >= jobLimitsEndPoint:
          self.log.info( '%s >= %s: running jobs reached the job limit for %s, skipping' % ( jobsByEndPoint, jobLimitsEndPoint, runningEndPointName ) )
          continue
        else:
          bigDataJobs = jobLimitsEndPoint - jobsByEndPoint
        requirementsDict = runningEndPointDict['Requirements']

        self.log.info( 'Requirements Dict: ', requirementsDict )
        result = taskQueueDB.getMatchingTaskQueues( requirementsDict )
        if not result['OK']:
          self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
          return result

        taskQueueDict = result['Value']
        self.log.info( 'Task Queues Dict: ', taskQueueDict )
        jobs = 0
        priority = 0
        cpu = 0
        jobsID = 0
        self.log.info( 'Pending jobs from TaskQueues that did not match before: ', self.pendingTaskQueueJobs )
        for tq in taskQueueDict:
          jobs += taskQueueDict[tq]['Jobs']
          priority += taskQueueDict[tq]['Priority']
          cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq]['CPUTime']

          # Matching of jobs with BigData software.
          # The process follows this sequence:
          #   - retrieve a job from taskQueueDict
          #   - get the job name and try to match it against the resources
          #   - if it does not match, store it in pendingTaskQueueJobs for the
          #     next iteration
          #
          # The matching uses the following JobName pattern:
          #   NameSoftware _ SoftwareVersion _ HighLanguageName _ HighLanguageVersion _ DataSetName
          # Extract a job from the TaskQueue
          if tq not in self.pendingTaskQueueJobs:
            self.pendingTaskQueueJobs[tq] = {}
          getJobFromTaskQueue = taskQueueDB.matchAndGetJob( taskQueueDict[tq] )
          if not getJobFromTaskQueue['OK']:
            self.log.error( 'Could not get Job from TaskQueue', getJobFromTaskQueue['Message'] )
            return getJobFromTaskQueue

          jobInfo = getJobFromTaskQueue['Value']
          jobID = jobInfo['jobId']
          jobAttrInfo = jobDB.getJobAttributes( jobID )

          if not jobAttrInfo['OK']:
            self.log.error( 'Could not get Job Attributes', jobAttrInfo['Message'] )
            return jobAttrInfo
          jobInfoUniq = jobAttrInfo['Value']
          jobName = jobInfoUniq['JobName']
          self.pendingTaskQueueJobs[tq][jobID] = jobName


          result = jobDB.getJobJDL( jobID, True )
          classAdJob = ClassAd( result['Value'] )
          arguments = 0
          if classAdJob.lookupAttribute( 'Arguments' ):
            arguments = classAdJob.getAttributeString( 'Arguments' )
          #if not classAdJob.lookupAttribute( 'Arguments' ):
          #  continue

          jobsToSubmit = self.matchingJobsForBDSubmission( arguments,
                                                       runningEndPointName,
                                                       runningEndPointDict['BigDataSoftware'],
                                                       runningEndPointDict['BigDataSoftwareVersion'],
                                                       runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                       runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                       jobID )
          if jobsToSubmit == "OK":
            if directorName not in bigDataJobsToSubmit:
              bigDataJobsToSubmit[directorName] = {}
            if runningEndPointName not in bigDataJobsToSubmit[directorName]:
              bigDataJobsToSubmit[directorName][runningEndPointName] = {}
            bigDataJobsToSubmit[directorName][runningEndPointName] = { 'JobId': jobID,
                                                        'JobName': jobName,
                                                        'TQPriority': priority,
                                                        'CPUTime': cpu,
                                                        'BigDataEndpoint': runningEndPointName,
                                                        'BigDataEndpointNameNode': runningEndPointDict['NameNode'],
                                                        'BdSoftware': runningEndPointDict['BigDataSoftware'],
                                                        'BdSoftwareVersion': runningEndPointDict['BigDataSoftwareVersion'],
                                                        'HLLName' : runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                        'HLLVersion' : runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                        'NumBigDataJobsAllowedToSubmit': bigDataJobs,
                                                        'SiteName': runningEndPointDict['SiteName'],
                                                        'PublicIP': runningEndPointDict['PublicIP'],
                                                        'User': runningEndPointDict['User'],
                                                        'Port': runningEndPointDict['Port'],
                                                        'UsePilot': runningEndPointDict['UsePilot'],
                                                        'IsInteractive': runningEndPointDict['IsInteractive'],
                                                        'Arguments': arguments }
            del self.pendingTaskQueueJobs[tq][jobID]
          else:
            self.log.error( jobsToSubmit )
        self.log.info( 'Pending jobs from TaskQueues that did not match after: ', self.pendingTaskQueueJobs )
        for tq in self.pendingTaskQueueJobs.keys():
          for jobid in self.pendingTaskQueueJobs[tq].keys():
            result = jobDB.getJobJDL( jobid, True )
            classAdJob = ClassAd( result['Value'] )
            arguments = 0
            if classAdJob.lookupAttribute( 'Arguments' ):
              arguments = classAdJob.getAttributeString( 'Arguments' )
            #if not classAdJob.lookupAttribute( 'Arguments' ):
            #  continue
            #do the match with the runningEndPoint
            jobsToSubmit = self.matchingJobsForBDSubmission( arguments,
                                                             runningEndPointName,
                                                             runningEndPointDict['BigDataSoftware'],
                                                             runningEndPointDict['BigDataSoftwareVersion'],
                                                             runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                             runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                             jobid )
            if jobsToSubmit == "OK":
              if directorName not in bigDataJobsToSubmit:
                bigDataJobsToSubmit[directorName] = {}
              if runningEndPointName not in bigDataJobsToSubmit[directorName]:
                bigDataJobsToSubmit[directorName][runningEndPointName] = {}
              bigDataJobsToSubmit[directorName][runningEndPointName] = { 'JobId': jobid,
                                                          'JobName': self.pendingTaskQueueJobs[tq][jobid],
                                                          'TQPriority': priority,
                                                          'CPUTime': cpu,
                                                          'BigDataEndpoint': runningEndPointName,
                                                          'BigDataEndpointNameNode': runningEndPointDict['NameNode'],
                                                          'BdSoftware': runningEndPointDict['BigDataSoftware'],
                                                          'BdSoftwareVersion': runningEndPointDict['BigDataSoftwareVersion'],
                                                          'HLLName' : runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                          'HLLVersion' : runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                          'NumBigDataJobsAllowedToSubmit': bigDataJobs,
                                                          'SiteName': runningEndPointDict['SiteName'],
                                                          'PublicIP': runningEndPointDict['PublicIP'],
                                                          'User': runningEndPointDict['User'],
                                                          'Port': runningEndPointDict['Port'],
                                                          'UsePilot': runningEndPointDict['UsePilot'],
                                                          'IsInteractive': runningEndPointDict['IsInteractive'],
                                                          'Arguments': arguments  }
              del self.pendingTaskQueueJobs[tq][jobid]
            else:
              self.log.error( jobsToSubmit )
        if not jobs and not self.pendingTaskQueueJobs:
          self.log.info( 'No matching jobs for %s found, skipping' % NameNode )
          continue

        self.log.info( 'BigData jobs to submit:', bigDataJobsToSubmit )

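    # Hand the matched jobs to the directors' thread pools for asynchronous submission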
    for directorName, JobsToSubmitDict in bigDataJobsToSubmit.items():
      for runningEndPointName, jobsToSubmitDict in JobsToSubmitDict.items():
        if self.directors[directorName]['isEnabled']:
          self.log.info( 'Requesting submission to %s of %s' % ( runningEndPointName, directorName ) )

          director = self.directors[directorName]['director']
          pool = self.pools[self.directors[directorName]['pool']]

          jobIDs = JobsToSubmitDict[runningEndPointName]['JobId']
          jobName = JobsToSubmitDict[runningEndPointName]['JobName']
          endpoint = JobsToSubmitDict[runningEndPointName]['BigDataEndpoint']
          runningSiteName = JobsToSubmitDict[runningEndPointName]['SiteName']
          NameNode = JobsToSubmitDict[runningEndPointName]['BigDataEndpointNameNode']
          BigDataSoftware = JobsToSubmitDict[runningEndPointName]['BdSoftware']
          BigDataSoftwareVersion = JobsToSubmitDict[runningEndPointName]['BdSoftwareVersion']
          HLLName = JobsToSubmitDict[runningEndPointName]['HLLName']
          HLLVersion = JobsToSubmitDict[runningEndPointName]['HLLVersion']
          PublicIP = JobsToSubmitDict[runningEndPointName]['PublicIP']
          User = JobsToSubmitDict[runningEndPointName]['User']
          Port = JobsToSubmitDict[runningEndPointName]['Port']
          UsePilot = JobsToSubmitDict[runningEndPointName]['UsePilot']
          IsInteractive = JobsToSubmitDict[runningEndPointName]['IsInteractive']
          Arguments = JobsToSubmitDict[runningEndPointName]['Arguments']
          numBigDataJobsAllowed = JobsToSubmitDict[runningEndPointName]['NumBigDataJobsAllowedToSubmit']

          ret = pool.generateJobAndQueueIt( director.submitBigDataJobs,
                                            args = ( endpoint, numBigDataJobsAllowed, runningSiteName, NameNode,
                                                     BigDataSoftware, BigDataSoftwareVersion, HLLName, HLLVersion,
                                                     PublicIP, Port, jobIDs, runningEndPointName, jobName, User, self.jobDataset, UsePilot, IsInteractive ),
                                            oCallback = self.callBack,
                                            oExceptionCallback = director.exceptionCallBack,
                                            blocking = False )
          if not ret['OK']:
            # Disable submission until next iteration
            self.directors[directorName]['isEnabled'] = False
          else:
            time.sleep( self.am_getOption( 'ThreadStartDelay' ) )

    if 'Default' in self.pools:
      # only for those in the 'Default' thread pool
      # for pool in self.pools:
      self.pools['Default'].processAllResults()

    return DIRAC.S_OK()
Example #6
    def execute(self):
        """Main Agent code:
          1.- Query TaskQueueDB for existing TQs
          2.- Add their Priorities
          3.- Submit pilots
        """

        self.__checkSubmitPools()

        self.directorDict = getResourceDict()

        result = taskQueueDB.getMatchingTaskQueues(self.directorDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                           result['Message'])
            return result
        taskQueueDict = result['Value']

        self.log.info('Found %s TaskQueues' % len(taskQueueDict))

        if not taskQueueDict:
            self.log.info('No TaskQueue to Process')
            return S_OK()

        prioritySum = 0
        waitingJobs = 0
        for taskQueueID in taskQueueDict:
            taskQueueDict[taskQueueID]['TaskQueueID'] = taskQueueID
            prioritySum += taskQueueDict[taskQueueID]['Priority']
            waitingJobs += taskQueueDict[taskQueueID]['Jobs']

        self.log.info('Sum of Priorities %s' % prioritySum)

        if waitingJobs == 0:
            self.log.info('No waiting Jobs')
            return S_OK('No waiting Jobs')
        if prioritySum <= 0:
            return S_ERROR('Wrong TaskQueue Priorities')

        self.pilotsPerPriority = self.am_getOption(
            'pilotsPerIteration') / prioritySum
        self.pilotsPerJob = self.am_getOption(
            'pilotsPerIteration') / waitingJobs

        self.callBackLock.acquire()
        self.submittedPilots = 0
        self.callBackLock.release()
        self.toSubmitPilots = 0
        waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
        timeLimitToConsider = Time.toString(
            Time.dateTime() -
            Time.hour * self.am_getOption("maxPilotWaitingHours"))

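        # For each TaskQueue, count the pilots still waiting and submit additional pilots as needed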
        for taskQueueID in taskQueueDict:
            self.log.verbose('Processing TaskQueue', taskQueueID)

            result = pilotAgentsDB.countPilots(
                {
                    'TaskQueueID': taskQueueID,
                    'Status': waitingStatusList
                }, None, timeLimitToConsider)
            if not result['OK']:
                self.log.error('Failed to get the number of waiting pilots',
                               result['Message'])
                waitingPilots = 0
            else:
                waitingPilots = result['Value']
                self.log.verbose(
                    'Waiting Pilots for TaskQueue %s:' % taskQueueID,
                    waitingPilots)

            result = self.submitPilotsForTaskQueue(taskQueueDict[taskQueueID],
                                                   waitingPilots)

            if result['OK']:
                self.toSubmitPilots += result['Value']

        self.log.info('Number of pilots to be Submitted %s' %
                      self.toSubmitPilots)

        # Now wait until all Jobs in the Default ThreadPool are processed
        if 'Default' in self.pools:
            # only for those in the 'Default' thread pool
            # for pool in self.pools:
            self.pools['Default'].processAllResults()

        self.log.info('Number of pilots Submitted %s' % self.submittedPilots)

        return S_OK()
  def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Count Pending Jobs
      3.- Submit VMs
    """

    self.__checkSubmitPools()

    imagesToSubmit = {}

    for directorName, directorDict in self.directors.items():
      self.log.verbose( 'Checking Director:', directorName )
      for runningPodName in directorDict['director'].runningPods:
        result = virtualMachineDB.insertRunningPod(runningPodName)
        if not result['OK']:
          self.log.error( 'Error inserting/updating Running Pod %s in DB: %s' % ( runningPodName, result['Message'] ) )
          continue
        result = virtualMachineDB.setRunningPodStatus(runningPodName)
        if not result['OK']:
          self.log.error( 'Error in setRunningPodStatus %s: %s' % ( runningPodName, result['Message'] ) )
          continue
        result = virtualMachineDB.getRunningPodStatus(runningPodName)
        if not result['OK']:
          self.log.error( 'Error in getRunningPodStatus %s: %s' % ( runningPodName, result['Message'] ) )
          continue
        status = result[ 'Value' ]
        if status == 'Active':
          self.log.info( 'RunningPod %s is Active' % ( runningPodName ) )
        else:
          self.log.info( 'RunningPod %s is inactive, doing nothing' % ( runningPodName ) )
          continue
        
        runningPodDict = directorDict['director'].runningPods[runningPodName]
        imageName = runningPodDict['Image']
        instances = 0
        result = virtualMachineDB.getInstancesByStatus( 'Running' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        result = virtualMachineDB.getInstancesByStatus( 'Submitted' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        result = virtualMachineDB.getInstancesByStatus( 'Wait_ssh_context' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        result = virtualMachineDB.getInstancesByStatus( 'Contextualizing' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        self.log.verbose( 'Checking Image %s:' % imageName, instances )
        maxInstances = runningPodDict['MaxInstances']
        if instances >= maxInstances:
          self.log.info( '%s >= %s: running instances reached MaxInstances for runningPod %s, skipping' % ( instances, maxInstances, runningPodName ) )
          continue

        cloudEndpointsStr = runningPodDict['CloudEndpoints']
        # Shuffle the configured endpoints so submission fails over to them in random order
        cloudEndpoints = cloudEndpointsStr.split( ',' )
        shuffle( cloudEndpoints )
        self.log.info( 'cloudEndpoints random failover: %s' % cloudEndpoints )
        numVMs = 0
        numVMsToSubmit = {}
        for endpoint in cloudEndpoints:
          self.log.info( 'Checking to submit to: %s' % endpoint )
          strMaxEndpointInstances = gConfig.getValue( "/Resources/VirtualMachines/CloudEndpoints/%s/%s" % ( endpoint, 'maxEndpointInstances' ), "" )
          if not strMaxEndpointInstances:
            self.log.info( 'CS CloudEndpoint %s has no maxEndpointInstances option defined' % endpoint )
            continue
          self.log.info( 'CS CloudEndpoint %s maxEndpointInstance: %s' % (endpoint,strMaxEndpointInstances) )

          vmPolicy = gConfig.getValue( "/Resources/VirtualMachines/CloudEndpoints/%s/%s" % ( endpoint, 'vmPolicy' ), "" )
          if not vmPolicy:
            self.log.info( 'CS CloudEndpoint %s has no vmPolicy option defined' % endpoint )
            continue
          self.log.info( 'CS CloudEndpoint %s vmPolicy: %s' % (endpoint,vmPolicy) )

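          # Count this image's instances on the endpoint in Running, Submitted, Wait_ssh_context or Contextualizing state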
          endpointInstances = 0
          result = virtualMachineDB.getInstancesByStatusAndEndpoint( 'Running', endpoint )
          if result['OK'] and imageName in result['Value']:
            endpointInstances += len( result['Value'][imageName] )
          result = virtualMachineDB.getInstancesByStatusAndEndpoint( 'Submitted', endpoint )
          if result['OK'] and imageName in result['Value']:
            endpointInstances += len( result['Value'][imageName] )
          result = virtualMachineDB.getInstancesByStatusAndEndpoint( 'Wait_ssh_context', endpoint )
          if result['OK'] and imageName in result['Value']:
            endpointInstances += len( result['Value'][imageName] )
          result = virtualMachineDB.getInstancesByStatusAndEndpoint( 'Contextualizing', endpoint )
          if result['OK'] and imageName in result['Value']:
            endpointInstances += len( result['Value'][imageName] )
          self.log.info( 'CS CloudEndpoint %s instances: %s, maxEndpointInstances: %s' % (endpoint,endpointInstances,strMaxEndpointInstances) )
          maxEndpointInstances = int(strMaxEndpointInstances)
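          # The 'elastic' policy starts one VM per iteration; 'static' fills the endpoint up to maxEndpointInstances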
          numVMs = 0
          if endpointInstances < maxEndpointInstances:
            if vmPolicy == 'elastic':
              numVMs = 1
            if vmPolicy == 'static':
              numVMs = maxEndpointInstances - endpointInstances
          numVMsToSubmit[endpoint] = numVMs

          # site to match with TQ:
          siteToMatch = gConfig.getValue( "/Resources/VirtualMachines/CloudEndpoints/%s/%s" % ( endpoint, 'siteName' ), "" )
          runningPodRequirementsDict = runningPodDict['Requirements']
          runningPodRequirementsDict['Site'] = siteToMatch

          self.log.verbose( 'Requirements to match: ', runningPodRequirementsDict )
          result = taskQueueDB.getMatchingTaskQueues( runningPodRequirementsDict )
          if not result['OK']:
            self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
            return result
          taskQueueDict = result['Value']
          self.log.verbose( 'Task Queues Dict: ', taskQueueDict )
          jobs = 0
          priority = 0
          cpu = 0
          for tq in taskQueueDict:
            jobs += taskQueueDict[tq]['Jobs']
            priority += taskQueueDict[tq]['Priority']
            cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq]['CPUTime']

          if not jobs:
            self.log.info( 'No matching jobs for %s found, skipping' % imageName )
            continue

          if instances and ( cpu / instances ) < runningPodDict['CPUPerInstance']:
            self.log.info( 'Waiting CPU per Running instance %s < %s, skipping' % ( cpu / instances, runningPodDict['CPUPerInstance'] ) )
            break

          if directorName not in imagesToSubmit:
            imagesToSubmit[directorName] = {}
          if imageName not in imagesToSubmit[directorName]:
            imagesToSubmit[directorName][imageName] = {}
          numVMs = numVMsToSubmit.get( endpoint )
          imagesToSubmit[directorName][imageName] = { 'Jobs': jobs,
                                                      'TQPriority': priority,
                                                      'CPUTime': cpu,
                                                      'CloudEndpoint': endpoint,
                                                      'NumVMsToSubmit': numVMs,
                                                      'VMPolicy': vmPolicy,
                                                      'RunningPodName': runningPodName,
                                                      'VMPriority': runningPodDict['Priority'] }

    for directorName, imageOfJobsToSubmitDict in imagesToSubmit.items():
      for imageName, jobsToSubmitDict in imageOfJobsToSubmitDict.items():
        if self.directors[directorName]['isEnabled'] and jobsToSubmitDict['NumVMsToSubmit'] > 0:
          self.log.info( 'Requesting submission of %s to %s' % ( imageName, directorName ) )

          director = self.directors[directorName]['director']
          pool = self.pools[self.directors[directorName]['pool']]

          endpoint = jobsToSubmitDict['CloudEndpoint']
          runningPodName = jobsToSubmitDict['RunningPodName']
          numVMs = jobsToSubmitDict['NumVMsToSubmit']

          ret = pool.generateJobAndQueueIt( director.submitInstance,
                                            args = ( imageName, endpoint, numVMs, runningPodName ),
                                            oCallback = self.callBack,
                                            oExceptionCallback = director.exceptionCallBack,
                                            blocking = False )

          if not ret['OK']:
            # Disable submission until next iteration
            self.directors[directorName]['isEnabled'] = False
          else:
            time.sleep( self.am_getOption( 'ThreadStartDelay' ) )

    if 'Default' in self.pools:
      # only for those in the 'Default' thread pool
      # for pool in self.pools:
      self.pools['Default'].processAllResults()

    return DIRAC.S_OK()
Example #8
    def execute(self):
        """Main Agent code:
          1.- Query TaskQueueDB for existing TQs
          2.- Count Pending Jobs
          3.- Submit VMs
        """

        self.__checkSubmitPools()

        imagesToSubmit = {}

        for directorName, directorDict in self.directors.items():
            self.log.verbose('Checking Director:', directorName)
            for imageName in directorDict['director'].images:
                imageDict = directorDict['director'].images[imageName]
                instances = 0
                result = virtualMachineDB.getInstancesByStatus('Running')
                if result['OK'] and imageName in result['Value']:
                    instances += len(result['Value'][imageName])
                result = virtualMachineDB.getInstancesByStatus('Submitted')
                if result['OK'] and imageName in result['Value']:
                    instances += len(result['Value'][imageName])
                self.log.verbose('Checking Image %s:' % imageName, instances)
                maxInstances = imageDict['MaxInstances']
                if instances >= maxInstances:
                    self.log.info(
                        '%s >= %s Running instances of %s, skipping' %
                        (instances, maxInstances, imageName))
                    continue

                imageRequirementsDict = imageDict['RequirementsDict']
                result = taskQueueDB.getMatchingTaskQueues(
                    imageRequirementsDict)
                if not result['OK']:
                    self.log.error(
                        'Could not retrieve TaskQueues from TaskQueueDB',
                        result['Message'])
                    return result
                taskQueueDict = result['Value']
                jobs = 0
                priority = 0
                cpu = 0
                for tq in taskQueueDict:
                    jobs += taskQueueDict[tq]['Jobs']
                    priority += taskQueueDict[tq]['Priority']
                    cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq][
                        'CPUTime']

                if not jobs:
                    self.log.info('No matching jobs for %s found, skipping' %
                                  imageName)
                    continue

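                # Require enough waiting CPU work per running instance before submitting more VMs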
                if instances and (cpu /
                                  instances) < imageDict['CPUPerInstance']:
                    self.log.info(
                        'Waiting CPU per Running instance %s < %s, skipping' %
                        (cpu / instances, imageDict['CPUPerInstance']))
                    continue

                if directorName not in imagesToSubmit:
                    imagesToSubmit[directorName] = {}
                if imageName not in imagesToSubmit[directorName]:
                    imagesToSubmit[directorName][imageName] = {}
                imagesToSubmit[directorName][imageName] = {
                    'Jobs': jobs,
                    'TQPriority': priority,
                    'CPUTime': cpu,
                    'VMPriority': imageDict['Priority']
                }

        for directorName, imageDict in imagesToSubmit.items():
            for imageName, jobsDict in imageDict.items():
                if self.directors[directorName]['isEnabled']:
                    self.log.info('Requesting submission of %s to %s' %
                                  (imageName, directorName))

                    director = self.directors[directorName]['director']
                    pool = self.pools[self.directors[directorName]['pool']]

                    ret = pool.generateJobAndQueueIt(
                        director.submitInstance,
                        args=(imageName, self.workDir),
                        oCallback=self.callBack,
                        oExceptionCallback=director.exceptionCallBack,
                        blocking=False)

                    if not ret['OK']:
                        # Disable submission until next iteration
                        self.directors[directorName]['isEnabled'] = False
                    else:
                        time.sleep(self.am_getOption('ThreadStartDelay'))

        if 'Default' in self.pools:
            # only for those in the 'Default' thread pool
            # for pool in self.pools:
            self.pools['Default'].processAllResults()

        return DIRAC.S_OK()