def execute( self ):
    """Main Agent code:
      1.- Query the Matcher service for the TaskQueues matching this director
      2.- Add their Priorities and waiting Jobs
      3.- Submit pilots for every TaskQueue

      Returns S_OK when the cycle completes or when there is nothing to do,
      the failed Matcher result when the TaskQueues cannot be retrieved, and
      S_ERROR when the sum of the TaskQueue priorities is not positive.
    """

    self.__checkSubmitPools()

    self.directorDict = getResourceDict()
    # Add all submit pools
    self.directorDict['SubmitPool'] = self.am_getOption( "SubmitPools" )
    # Add all DIRAC platforms if not specified otherwise
    if 'Platform' not in self.directorDict:
      result = gConfig.getOptionsDict( '/Resources/Computing/OSCompatibility' )
      if result['OK']:
        # Store a real list (not a dict view) so the value has the same type
        # under Python 2 and Python 3
        self.directorDict['Platform'] = list( result['Value'] )
      else:
        # Not fatal: the matching simply goes on without a Platform requirement
        self.log.error( 'Could not retrieve the DIRAC platforms', result.get( 'Message', '' ) )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( self.directorDict )
    if not result['OK']:
      self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
      return result
    taskQueueDict = result['Value']

    self.log.info( 'Found %s TaskQueues' % len( taskQueueDict ) )

    if not taskQueueDict:
      self.log.info( 'No TaskQueue to Process' )
      return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID, taskQueue in taskQueueDict.items():
      # Keep the ID inside the TaskQueue description that is handed over below
      taskQueue['TaskQueueID'] = taskQueueID
      prioritySum += taskQueue['Priority']
      waitingJobs += taskQueue['Jobs']

    self.log.info( 'Sum of Priorities %s' % prioritySum )

    if waitingJobs == 0:
      self.log.info( 'No waiting Jobs' )
      return S_OK( 'No waiting Jobs' )
    if prioritySum <= 0:
      return S_ERROR( 'Wrong TaskQueue Priorities' )

    # Share the pilots of this iteration among priorities and waiting jobs
    pilotsPerIteration = self.am_getOption( 'pilotsPerIteration' )
    self.pilotsPerPriority = pilotsPerIteration / prioritySum
    self.pilotsPerJob = pilotsPerIteration / waitingJobs

    # Reset the counter under the lock; release it whatever happens
    self.callBackLock.acquire()
    try:
      self.submittedPilots = 0
    finally:
      self.callBackLock.release()
    self.toSubmitPilots = 0
    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    timeLimitToConsider = Time.toString( Time.dateTime() - Time.hour * self.am_getOption( "maxPilotWaitingHours" ) )

    for taskQueueID, taskQueue in taskQueueDict.items():
      self.log.verbose( 'Processing TaskQueue', taskQueueID )

      result = pilotAgentsDB.countPilots( { 'TaskQueueID': taskQueueID,
                                            'Status': waitingStatusList },
                                          None, timeLimitToConsider )
      if not result['OK']:
        self.log.error( 'Fail to get Number of Waiting pilots', result['Message'] )
        # Best effort: go on as if no pilot was waiting for this TaskQueue
        waitingPilots = 0
      else:
        waitingPilots = result['Value']
        self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots )

      result = self.submitPilotsForTaskQueue( taskQueue, waitingPilots )

      if result['OK']:
        self.toSubmitPilots += result['Value']
      else:
        # Do not drop the failure silently; the other TaskQueues are still processed
        self.log.error( 'Failed to submit pilots for TaskQueue %s' % taskQueueID,
                        result.get( 'Message', '' ) )

    self.log.info( 'Number of pilots to be Submitted %s' % self.toSubmitPilots )

    # Now wait until all Jobs in the Default ThreadPool are processed
    if 'Default' in self.pools:
      # only for those in the 'Default' thread Pool
      self.pools['Default'].processAllResults()

    self.log.info( 'Number of pilots Submitted %s' % self.submittedPilots )

    return S_OK()
# Example #2
# 0
  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method does the actual pilot submission to the DIRAC CE
      The logic is as follows:
      - If list matching is enabled and there is no queue available, it returns error
      - It creates a temp directory under workDir and moves into it
      - For every configured CE it prepares a PilotScript and submits pilots
        while the CE reports available slots, up to pilotsToSubmit in total
      - The temp directory is removed and the previous working directory is
        restored on every exit path

      pilotOptions is not modified; the site and CE specific options are added
      to per-CE copies. ceMask, submitPrivatePilot, privateTQ and pilotsPerJob
      are part of the signature but are not used by this method.

      Returns S_OK( number of submitted pilots ), or S_ERROR when no queue is
      available, the pilot script cannot be written or a submission fails.
    """

    taskQueueID = taskQueueDict['TaskQueueID']

    submittedPilots = 0

    pilotRequirements = [ ( 'CPUTime', taskQueueDict['CPUTime'] ) ]
    # do we need to care about anything else?
    pilotRequirementsString = str( pilotRequirements )

    # Check that there are available queues for the Jobs.
    # The check is skipped altogether when list matching is disabled.
    if self.enableListMatch:
      availableQueues = self.listMatchCache.get( pilotRequirementsString )
      # A False value is treated as "not cached"; an empty list is a valid
      # cached answer, hence the explicit comparison instead of "not ..."
      if availableQueues == False:
        availableQueues = self._listQueues( pilotRequirements )
        if availableQueues != False:
          self.listMatchCache.add( pilotRequirementsString, self.listMatchDelay, availableQueues )
          self.log.verbose( 'Available Queues for TaskQueue ', "%s: %s" % ( taskQueueID, str( availableQueues ) ) )

      if not availableQueues:
        return S_ERROR( ERROR_CE + ' TQ: %s' % taskQueueID )

    baseDir = os.getcwd()
    workingDirectory = tempfile.mkdtemp( prefix = 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    try:
      os.chdir( workingDirectory )

      # Work on copies: neither the caller's list nor the options of one CE
      # may leak into the pilot script of another CE
      siteOptions = list( pilotOptions )
      # set the Site Name
      siteOptions.append( "-n '%s'" % self.siteName )

      # submit pilots for every CE available
      for ceName, ceConfigDict in self.computingElementDict.items():
        computingElement = ceConfigDict['CE']
        cePilotOptions = list( siteOptions )

        # add possible requirements from Site and CE
        for req, val in getResourceDict( ceName ).items():
          cePilotOptions.append( "-o '/AgentJobRequirements/%s=%s'" % ( req, val ) )

        if 'ClientPlatform' in ceConfigDict:
          cePilotOptions.append( "-p '%s'" % ceConfigDict['ClientPlatform'] )

        if 'SharedArea' in ceConfigDict:
          cePilotOptions.append( "-o '/LocalSite/SharedArea=%s'" % ceConfigDict['SharedArea'] )

        self.log.info( "pilotOptions: ", ' '.join( cePilotOptions ) )

        httpProxy = ''
        if 'HttpProxy' in ceConfigDict:
          httpProxy = ceConfigDict['HttpProxy']

        # Default to an empty value so the name is always bound and never
        # inherited from the previous CE
        pilotExecDir = ''
        if 'JobExecDir' in ceConfigDict:
          pilotExecDir = ceConfigDict['JobExecDir']

        try:
          pilotScript = self._writePilotScript( workingDirectory, cePilotOptions, proxy, httpProxy, pilotExecDir )
        except Exception:
          self.log.exception( ERROR_SCRIPT )
          return S_ERROR( ERROR_SCRIPT )

        self.log.info( "Pilots to submit: ", pilotsToSubmit )
        while submittedPilots < pilotsToSubmit:
          # Find out how many pilots can be submitted
          ret = computingElement.available()
          if not ret['OK']:
            self.log.error( 'Can not determine if pilot should be submitted: ', ret['Message'] )
            break
          maxPilotsToSubmit = ret['Value']
          self.log.info( "Submit Pilots: ", maxPilotsToSubmit )
          if not maxPilotsToSubmit:
            break
          pilotsInBatch = min( maxPilotsToSubmit, pilotsToSubmit - submittedPilots )
          if pilotsInBatch <= 0:
            # A negative availability would otherwise make this loop spin forever
            break
          # submit the pilots and then check again
          for _i in range( pilotsInBatch ):
            submission = computingElement.submitJob( pilotScript, '', '' )
            if not submission['OK']:
              self.log.error( 'Pilot submission failed: ', submission['Message'] )
              return S_ERROR( 'Pilot submission failed after %s pilots submitted successful' % submittedPilots )
            submittedPilots += 1
            # let the batch system some time to digest the submitted job
            time.sleep( 1 )

        # next CE

      return S_OK( submittedPilots )

    finally:
      # Single cleanup point for every exit path above
      try:
        os.chdir( baseDir )
        shutil.rmtree( workingDirectory )
      except OSError as error:
        # Best effort: a failed cleanup must not hide the submission result
        self.log.error( 'Could not clean up working directory %s' % workingDirectory, str( error ) )
# Example #3
# 0
    def execute(self):
        """Main Agent code:
          1.- Query the Matcher service for the TaskQueues matching this director
          2.- Add their Priorities and waiting Jobs
          3.- Submit pilots for every TaskQueue

        Returns S_OK when the cycle completes or when there is nothing to do,
        the failed Matcher result when the TaskQueues cannot be retrieved, and
        S_ERROR when the sum of the TaskQueue priorities is not positive.
        """

        self.__checkSubmitPools()

        self.directorDict = getResourceDict()
        # Add all submit pools
        self.directorDict["SubmitPool"] = self.am_getOption("SubmitPools")
        # Add all DIRAC platforms if not specified otherwise
        if "Platform" not in self.directorDict:
            result = getDIRACPlatforms()
            if result["OK"]:
                self.directorDict["Platform"] = result["Value"]
            else:
                # Not fatal: the matching simply goes on without a Platform requirement
                self.log.error("Could not retrieve the DIRAC platforms", result.get("Message", ""))

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(self.directorDict)
        if not result["OK"]:
            self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
            return result
        taskQueueDict = result["Value"]

        self.log.info("Found %s TaskQueues" % len(taskQueueDict))

        if not taskQueueDict:
            self.log.info("No TaskQueue to Process")
            return S_OK()

        prioritySum = 0
        waitingJobs = 0
        for taskQueueID, taskQueue in taskQueueDict.items():
            # Keep the ID inside the TaskQueue description that is handed over below
            taskQueue["TaskQueueID"] = taskQueueID
            prioritySum += taskQueue["Priority"]
            waitingJobs += taskQueue["Jobs"]

        self.log.info("Sum of Priorities %s" % prioritySum)

        if waitingJobs == 0:
            self.log.info("No waiting Jobs")
            return S_OK("No waiting Jobs")
        if prioritySum <= 0:
            return S_ERROR("Wrong TaskQueue Priorities")

        # Share the pilots of this iteration among priorities and waiting jobs
        pilotsPerIteration = self.am_getOption("pilotsPerIteration")
        self.pilotsPerPriority = pilotsPerIteration / prioritySum
        self.pilotsPerJob = pilotsPerIteration / waitingJobs

        # Reset the counter under the lock; release it whatever happens
        self.callBackLock.acquire()
        try:
            self.submittedPilots = 0
        finally:
            self.callBackLock.release()
        self.toSubmitPilots = 0
        waitingStatusList = ["Submitted", "Ready", "Scheduled", "Waiting"]
        timeLimitToConsider = Time.toString(Time.dateTime() - Time.hour * self.am_getOption("maxPilotWaitingHours"))

        for taskQueueID, taskQueue in taskQueueDict.items():
            self.log.verbose("Processing TaskQueue", taskQueueID)

            result = pilotAgentsDB.countPilots(
                {"TaskQueueID": taskQueueID, "Status": waitingStatusList}, None, timeLimitToConsider
            )
            if not result["OK"]:
                self.log.error("Fail to get Number of Waiting pilots", result["Message"])
                # Best effort: go on as if no pilot was waiting for this TaskQueue
                waitingPilots = 0
            else:
                waitingPilots = result["Value"]
                self.log.verbose("Waiting Pilots for TaskQueue %s:" % taskQueueID, waitingPilots)

            result = self.submitPilotsForTaskQueue(taskQueue, waitingPilots)

            if result["OK"]:
                self.toSubmitPilots += result["Value"]
            else:
                # Do not drop the failure silently; the other TaskQueues are still processed
                self.log.error(
                    "Failed to submit pilots for TaskQueue %s" % taskQueueID, result.get("Message", "")
                )

        self.log.info("Number of pilots to be Submitted %s" % self.toSubmitPilots)

        # Now wait until all Jobs in the Default ThreadPool are processed
        if "Default" in self.pools:
            # only for those in the 'Default' thread Pool
            self.pools["Default"].processAllResults()

        self.log.info("Number of pilots Submitted %s" % self.submittedPilots)

        return S_OK()
    def _submitPilots(self, workDir, taskQueueDict, pilotOptions,
                      pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ,
                      proxy, pilotsPerJob):
        """
        This method does the actual pilot submission to the DIRAC CE
        The logic is as follows:
        - If list matching is enabled and there is no queue available, it returns error
        - It creates a temp directory under workDir and moves into it
        - For every configured CE it prepares a PilotScript and submits pilots
          while the CE reports available slots, up to pilotsToSubmit in total
        - The temp directory is removed and the previous working directory is
          restored on every exit path

        pilotOptions is not modified; the site and CE specific options are
        added to per-CE copies. ceMask, submitPrivatePilot, privateTQ and
        pilotsPerJob are part of the signature but are not used by this method.

        Returns S_OK( number of submitted pilots ), or S_ERROR when no queue is
        available, the pilot script cannot be written or a submission fails.
        """

        taskQueueID = taskQueueDict['TaskQueueID']

        submittedPilots = 0

        pilotRequirements = [('CPUTime', taskQueueDict['CPUTime'])]
        # do we need to care about anything else?
        pilotRequirementsString = str(pilotRequirements)

        # Check that there are available queues for the Jobs.
        # The check is skipped altogether when list matching is disabled.
        if self.enableListMatch:
            availableQueues = self.listMatchCache.get(pilotRequirementsString)
            # A False value is treated as "not cached"; an empty list is a valid
            # cached answer, hence the explicit comparison instead of "not ..."
            if availableQueues == False:
                availableQueues = self._listQueues(pilotRequirements)
                if availableQueues != False:
                    self.listMatchCache.add(pilotRequirementsString,
                                            self.listMatchDelay,
                                            availableQueues)
                    self.log.verbose(
                        'Available Queues for TaskQueue ',
                        "%s: %s" % (taskQueueID, str(availableQueues)))

            if not availableQueues:
                return S_ERROR(ERROR_CE + ' TQ: %s' % taskQueueID)

        baseDir = os.getcwd()
        workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID,
                                            dir=workDir)
        self.log.verbose('Using working Directory:', workingDirectory)

        try:
            os.chdir(workingDirectory)

            # Work on copies: neither the caller's list nor the options of one
            # CE may leak into the pilot script of another CE
            siteOptions = list(pilotOptions)
            # set the Site Name
            siteOptions.append("-n '%s'" % self.siteName)

            # submit pilots for every CE available
            for ceName, ceConfigDict in self.computingElementDict.items():
                computingElement = ceConfigDict['CE']
                cePilotOptions = list(siteOptions)

                # add possible requirements from Site and CE
                for req, val in getResourceDict(ceName).items():
                    cePilotOptions.append(
                        "-o '/AgentJobRequirements/%s=%s'" % (req, val))

                if 'ClientPlatform' in ceConfigDict:
                    cePilotOptions.append(
                        "-p '%s'" % ceConfigDict['ClientPlatform'])

                if 'SharedArea' in ceConfigDict:
                    cePilotOptions.append(
                        "-o '/LocalSite/SharedArea=%s'" %
                        ceConfigDict['SharedArea'])

                if 'CPUScalingFactor' in ceConfigDict:
                    cePilotOptions.append(
                        "-o '/LocalSite/CPUScalingFactor=%s'" %
                        ceConfigDict['CPUScalingFactor'])

                if 'CPUNormalizationFactor' in ceConfigDict:
                    cePilotOptions.append(
                        "-o '/LocalSite/CPUNormalizationFactor=%s'" %
                        ceConfigDict['CPUNormalizationFactor'])

                self.log.info("pilotOptions: ", ' '.join(cePilotOptions))

                httpProxy = ''
                if 'HttpProxy' in ceConfigDict:
                    httpProxy = ceConfigDict['HttpProxy']

                # Default to an empty value so the name is always bound and
                # never inherited from the previous CE
                pilotExecDir = ''
                if 'JobExecDir' in ceConfigDict:
                    pilotExecDir = ceConfigDict['JobExecDir']

                try:
                    pilotScript = self._writePilotScript(
                        workingDirectory, cePilotOptions, proxy, httpProxy,
                        pilotExecDir)
                except Exception:
                    self.log.exception(ERROR_SCRIPT)
                    return S_ERROR(ERROR_SCRIPT)

                self.log.info("Pilots to submit: ", pilotsToSubmit)
                while submittedPilots < pilotsToSubmit:
                    # Find out how many pilots can be submitted
                    ret = computingElement.available()
                    if not ret['OK']:
                        self.log.error(
                            'Can not determine if pilot should be submitted: ',
                            ret['Message'])
                        break
                    maxPilotsToSubmit = ret['Value']
                    self.log.info("Submit Pilots: ", maxPilotsToSubmit)
                    if not maxPilotsToSubmit:
                        break
                    pilotsInBatch = min(maxPilotsToSubmit,
                                        pilotsToSubmit - submittedPilots)
                    if pilotsInBatch <= 0:
                        # A negative availability would otherwise make this
                        # loop spin forever
                        break
                    # submit the pilots and then check again
                    for _i in range(pilotsInBatch):
                        submission = computingElement.submitJob(
                            pilotScript, '', '')
                        if not submission['OK']:
                            self.log.error('Pilot submission failed: ',
                                           submission['Message'])
                            return S_ERROR(
                                'Pilot submission failed after %s pilots submitted successful'
                                % submittedPilots)
                        submittedPilots += 1
                        # let the batch system some time to digest the submitted job
                        time.sleep(1)

                # next CE

            return S_OK(submittedPilots)

        finally:
            # Single cleanup point for every exit path above
            try:
                os.chdir(baseDir)
                shutil.rmtree(workingDirectory)
            except OSError as error:
                # Best effort: a failed cleanup must not hide the submission result
                self.log.error(
                    'Could not clean up working directory %s' %
                    workingDirectory, str(error))
    def execute(self):
        """Main Agent code:
          1.- Query the Matcher service for the TaskQueues matching this director
          2.- Add their Priorities and waiting Jobs
          3.- Submit pilots for every TaskQueue

        Returns S_OK when the cycle completes or when there is nothing to do,
        the failed Matcher result when the TaskQueues cannot be retrieved, and
        S_ERROR when the sum of the TaskQueue priorities is not positive.
        """

        self.__checkSubmitPools()

        self.directorDict = getResourceDict()
        # Add all submit pools
        self.directorDict['SubmitPool'] = self.am_getOption("SubmitPools")

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(self.directorDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                           result['Message'])
            return result
        taskQueueDict = result['Value']

        self.log.info('Found %s TaskQueues' % len(taskQueueDict))

        if not taskQueueDict:
            self.log.info('No TaskQueue to Process')
            return S_OK()

        prioritySum = 0
        waitingJobs = 0
        for taskQueueID, taskQueue in taskQueueDict.items():
            # Keep the ID inside the TaskQueue description that is handed over below
            taskQueue['TaskQueueID'] = taskQueueID
            prioritySum += taskQueue['Priority']
            waitingJobs += taskQueue['Jobs']

        self.log.info('Sum of Priorities %s' % prioritySum)

        if waitingJobs == 0:
            self.log.info('No waiting Jobs')
            return S_OK('No waiting Jobs')
        if prioritySum <= 0:
            return S_ERROR('Wrong TaskQueue Priorities')

        # Share the pilots of this iteration among priorities and waiting jobs
        pilotsPerIteration = self.am_getOption('pilotsPerIteration')
        self.pilotsPerPriority = pilotsPerIteration / prioritySum
        self.pilotsPerJob = pilotsPerIteration / waitingJobs

        # Reset the counter under the lock; release it whatever happens
        self.callBackLock.acquire()
        try:
            self.submittedPilots = 0
        finally:
            self.callBackLock.release()
        self.toSubmitPilots = 0
        waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
        timeLimitToConsider = Time.toString(
            Time.dateTime() -
            Time.hour * self.am_getOption("maxPilotWaitingHours"))

        for taskQueueID, taskQueue in taskQueueDict.items():
            self.log.verbose('Processing TaskQueue', taskQueueID)

            result = pilotAgentsDB.countPilots(
                {
                    'TaskQueueID': taskQueueID,
                    'Status': waitingStatusList
                }, None, timeLimitToConsider)
            if not result['OK']:
                self.log.error('Fail to get Number of Waiting pilots',
                               result['Message'])
                # Best effort: go on as if no pilot was waiting for this TaskQueue
                waitingPilots = 0
            else:
                waitingPilots = result['Value']
                self.log.verbose(
                    'Waiting Pilots for TaskQueue %s:' % taskQueueID,
                    waitingPilots)

            result = self.submitPilotsForTaskQueue(taskQueue, waitingPilots)

            if result['OK']:
                self.toSubmitPilots += result['Value']
            else:
                # Do not drop the failure silently; the other TaskQueues are still processed
                self.log.error(
                    'Failed to submit pilots for TaskQueue %s' % taskQueueID,
                    result.get('Message', ''))

        self.log.info('Number of pilots to be Submitted %s' %
                      self.toSubmitPilots)

        # Now wait until all Jobs in the Default ThreadPool are processed
        if 'Default' in self.pools:
            # only for those in the 'Default' thread Pool
            self.pools['Default'].processAllResults()

        self.log.info('Number of pilots Submitted %s' % self.submittedPilots)

        return S_OK()