Ejemplo n.º 1
0
  def getQueueSlots( self, queue, manyWaitingPilotsFlag ):
    """ Return the number of slots that can currently be used in the queue

        The value cached in self.queueSlots[queue]['AvailableSlots'] is used when
        it is not zero. When it is zero the CE is asked for its availability, but
        only when the per-queue call counter 'AvailableSlotsCount' is a multiple
        of 10. The counter is incremented on each call that reaches that stage.

        If manyWaitingPilotsFlag is set, 0 is returned as soon as waiting pilots
        are found for the queue (or when this can not be verified).
    """
    queueInfo = self.queueDict[queue]
    ce = queueInfo['CE']
    ceName = queueInfo['CEName']
    queueName = queueInfo['QueueName']

    slotsInfo = self.queueSlots.setdefault( queue, {} )
    totalSlots = slotsInfo.get( 'AvailableSlots', 0 )

    # With cached slots and the flag set, submission is only allowed if
    # no waiting pilots are registered for this queue
    if totalSlots and manyWaitingPilotsFlag:
      waitingSelection = { 'DestinationSite':ceName,
                           'Queue':queueName,
                           'Status': WAITING_PILOT_STATUS }
      result = pilotAgentsDB.selectPilots( waitingSelection )
      if result['OK'] and not result['Value']:
        return totalSlots
      return 0

    pollCounter = slotsInfo.setdefault( 'AvailableSlotsCount', 0 )
    # Pessimistic default used when the CE is not asked in this call
    waitingJobs = 1
    if totalSlots == 0 and pollCounter % 10 == 0:

      # Pilots already known for this queue are passed to the CE
      transientSelection = { 'DestinationSite':ceName,
                             'Queue':queueName,
                             'Status': TRANSIENT_PILOT_STATUS }
      result = pilotAgentsDB.selectPilots( transientSelection )
      jobIDList = None
      if result['OK']:
        jobIDList = result['Value']

      result = ce.available( jobIDList )
      if result['OK']:
        ceInfoDict = result['CEInfoDict']
        reportValues = ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                         ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] )
        self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % reportValues )
        totalSlots = result['Value']
        slotsInfo['AvailableSlots'] = totalSlots
        waitingJobs = ceInfoDict['WaitingJobs']
      else:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        self.failedQueues[queue] += 1

    slotsInfo['AvailableSlotsCount'] += 1

    if manyWaitingPilotsFlag and waitingJobs:
      return 0
    return totalSlots
Ejemplo n.º 2
0
  def getQueueSlots( self, queue, manyWaitingPilotsFlag ):
    """ Return the number of slots that can currently be used in the queue

        A non-zero value cached under 'AvailableSlots' in self.queueSlots[queue]
        is reused. A zero value triggers a query to the CE, but only when the
        per-queue call counter 'AvailableSlotsCount' is a multiple of 10.

        With manyWaitingPilotsFlag set, 0 is returned whenever waiting pilots
        exist for the queue or their absence can not be established.
    """
    description = self.queueDict[queue]
    ce = description['CE']
    ceName = description['CEName']
    queueName = description['QueueName']

    cache = self.queueSlots.setdefault( queue, {} )
    totalSlots = cache.get( 'AvailableSlots', 0 )

    if totalSlots and manyWaitingPilotsFlag:
      # Allow the submission only if no waiting pilots are found for the queue
      result = pilotAgentsDB.selectPilots( { 'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'Status': WAITING_PILOT_STATUS } )
      noWaitingPilots = result['OK'] and not result['Value']
      if noWaitingPilots:
        return totalSlots
      return 0

    callCount = cache.setdefault( 'AvailableSlotsCount', 0 )
    # Assume waiting jobs unless the CE reports otherwise below
    waitingJobs = 1
    if totalSlots == 0 and callCount % 10 == 0:

      # Collect the pilots already existing for this queue
      result = pilotAgentsDB.selectPilots( { 'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'Status': TRANSIENT_PILOT_STATUS } )
      jobIDList = None
      if result['OK']:
        jobIDList = result['Value']

      result = ce.available( jobIDList )
      if result['OK']:
        ceInfoDict = result['CEInfoDict']
        self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                       ( ceName, queueName,
                         ceInfoDict['WaitingJobs'],
                         ceInfoDict['RunningJobs'],
                         ceInfoDict['SubmittedJobs'],
                         ceInfoDict['MaxTotalJobs'] ) )
        totalSlots = result['Value']
        cache['AvailableSlots'] = totalSlots
        waitingJobs = ceInfoDict['WaitingJobs']
      else:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        self.failedQueues[queue] += 1

    cache['AvailableSlotsCount'] += 1

    if manyWaitingPilotsFlag and waitingJobs:
      return 0
    return totalSlots
Ejemplo n.º 3
0
  def __getQueueSlots( self, queue ):
    """ Return the number of available slots in the queue

        A non-zero value cached under 'AvailableSlots' in self.queueSlots[queue]
        is returned as is. When the cached value is zero, the CE is asked for its
        availability, but only when the per-queue call counter
        'AvailableSlotsCount' is a multiple of 10. The counter is incremented on
        every call.
    """
    queueInfo = self.queueDict[queue]
    ce = queueInfo['CE']
    ceName = queueInfo['CEName']
    queueName = queueInfo['QueueName']

    slotsInfo = self.queueSlots.setdefault( queue, {} )
    totalSlots = slotsInfo.get( 'AvailableSlots', 0 )
    pollCounter = slotsInfo.setdefault( 'AvailableSlotsCount', 0 )

    if totalSlots == 0 and pollCounter % 10 == 0:

      # Pilots already existing for this queue are passed to the CE
      selection = { 'DestinationSite':ceName,
                    'Queue':queueName,
                    'Status':['Running','Submitted','Scheduled'] }
      result = pilotAgentsDB.selectPilots( selection )
      jobIDList = None
      if result['OK']:
        jobIDList = result['Value']

      result = ce.available( jobIDList )
      if result['OK']:
        ceInfoDict = result['CEInfoDict']
        reportValues = ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                         ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] )
        self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % reportValues )
        totalSlots = result['Value']
        slotsInfo['AvailableSlots'] = totalSlots
      else:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        self.failedQueues[queue] += 1

    slotsInfo['AvailableSlotsCount'] += 1
    return totalSlots
Ejemplo n.º 4
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states

        First pass, for each queue in self.queueDict:
          - select the pilots of self.pilotDN / self.pilotGroup that are in one
            of the TRANSIENT_PILOT_STATUS states
          - ask the CE for their current status and update the PilotAgentsDB
          - retrieve the output of pilots that reached a final state, if
            self.getOutput is set
          - increment self.failedQueues[queue] if any pilot was set to Aborted

        Second pass, for each queue:
          - retrieve outputs of pilots in a final state with no output stored yet
          - send the pilot accounting if self.sendAccounting is set

        Returns S_OK(), or the error result of the pilot proxy retrieval if a
        proxy renewal fails.
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      abortedPilots = 0

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'Status':TRANSIENT_PILOT_STATUS,
                                            'OwnerDN': self.pilotDN,
                                            'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']

      # Build "<reference>:::<stamp>" identifiers for the CE. As soon as one pilot
      # has no stamp, fall back to the plain references for all the pilots.
      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      # Renew the pilot proxy of the CE if it is not valid anymore
      result = ce.isProxyValid()
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 23300 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
        sinceLastUpdate = dateTime() - lastUpdateTime

        if oldStatus == ceStatus and ceStatus != "Unknown":
          # Normal status did not change, continue
          continue
        elif ceStatus == "Unknown" and oldStatus == "Unknown":
          if sinceLastUpdate < 3600*second:
            # Allow 1 hour of Unknown status assuming temporary problems on the CE
            continue
          else:
            newStatus = 'Aborted'
        elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
          # Possible problems on the CE, let's keep the Unknown status for a while
          newStatus = 'Unknown'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
          if newStatus == "Aborted":
            abortedPilots += 1
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

      # If something wrong in the queue, make a pause for the job submission
      if abortedPilots:
        self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      # isProxyValid() returns a result structure (its 'OK' flag is read in the
      # first loop above). A non-empty dictionary is always true, so the flag
      # must be tested explicitly, otherwise the proxy is never renewed here.
      result = ce.isProxyValid( 120 )
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        # Hand the freshly obtained proxy to the CE, not the previous self.proxy
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'OutputReady':'False',
                                            'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                              'Queue':queueName,
                                              'GridType':ceType,
                                              'GridSite':siteName,
                                              'AccountingSent':'False',
                                              'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Ejemplo n.º 5
0
    def updatePilotStatus(self):
        """ Update status of pilots in transient states

        First pass, for each queue in self.queueDict:
          - select the pilots of self.pilotDN / self.pilotGroup that are in one
            of the TRANSIENT_PILOT_STATUS states
          - ask the CE for their current status and update the PilotAgentsDB
          - retrieve the output of pilots that reached a final state, if
            self.getOutput is set

        Second pass, for each queue:
          - retrieve outputs of pilots in a final state with no output stored yet
          - send the pilot accounting if self.sendAccounting is set

        Returns S_OK(), or the error result of the pilot proxy retrieval if a
        proxy renewal fails.
        """
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']

            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'Status': TRANSIENT_PILOT_STATUS,
                'OwnerDN': self.pilotDN,
                'OwnerGroup': self.pilotGroup
            })
            if not result['OK']:
                self.log.error('Failed to select pilots: %s' %
                               result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']

            # Build "<reference>:::<stamp>" identifiers for the CE. As soon as
            # one pilot has no stamp, fall back to the plain references for
            # all the pilots.
            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]['PilotStamp']:
                    stampedPilotRefs.append(pRef + ":::" +
                                            pilotDict[pRef]['PilotStamp'])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            # Renew the pilot proxy of the CE if it is not valid anymore
            result = ce.isProxyValid()
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 600)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots status from CE',
                               '%s: %s' % (ceName, result['Message']))
                continue
            pilotCEDict = result['Value']

            for pRef in pilotRefs:
                newStatus = ''
                oldStatus = pilotDict[pRef]['Status']
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = 'Aborted'
                elif ceStatus != 'Unknown':
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if newStatus:
                    self.log.info('Updating status to %s for pilot %s' %
                                  (newStatus, pRef))
                    result = pilotAgentsDB.setPilotStatus(
                        pRef, newStatus, '', 'Updated by SiteDirector')
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower(
                    ) == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error(
                                'Failed to get pilot output',
                                '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(
                                    pRef, output, error)
                                if not result['OK']:
                                    self.log.error(
                                        'Failed to store pilot output',
                                        result['Message'])
                            else:
                                self.log.warn(
                                    'Empty pilot output not stored to PilotDB')

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']

            # isProxyValid() returns a result structure (its 'OK' flag is read
            # in the first loop above). A non-empty dictionary is always true,
            # so the flag must be tested explicitly, otherwise the proxy is
            # never renewed here.
            result = ce.isProxyValid(120)
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 1000)
                if not result['OK']:
                    return result
                # Hand the freshly obtained proxy to the CE, not the previous
                # self.proxy
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 940)

            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']
            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'OutputReady': 'False',
                'Status': FINAL_PILOT_STATUS
            })

            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']
            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output',
                                       '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        result = pilotAgentsDB.storePilotOutput(
                            pRef, output, error)
                        if not result['OK']:
                            self.log.error('Failed to store pilot output',
                                           result['Message'])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots({
                    'DestinationSite': ceName,
                    'Queue': queueName,
                    'GridType': ceType,
                    'GridSite': siteName,
                    'AccountingSent': 'False',
                    'Status': FINAL_PILOT_STATUS
                })

                if not result['OK']:
                    self.log.error('Failed to select pilots',
                                   result['Message'])
                    continue
                pilotRefs = result['Value']
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result['OK']:
                    self.log.error('Failed to get pilots info from DB',
                                   result['Message'])
                    continue
                pilotDict = result['Value']
                result = self.sendPilotAccounting(pilotDict)
                if not result['OK']:
                    self.log.error('Failed to send pilot agent accounting')

        return S_OK()
Ejemplo n.º 6
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states

        First pass, for each queue in self.queueDict:
          - select the pilots of self.pilotDN / self.pilotGroup that are in one
            of the TRANSIENT_PILOT_STATUS states
          - ask the CE for their current status and update the PilotAgentsDB
          - retrieve the output of pilots that reached a final state, if
            self.getOutput is set

        Second pass, for each queue:
          - retrieve outputs of pilots in a final state with no output stored yet
          - send the pilot accounting if self.sendAccounting is set

        Returns S_OK(), or the error result of the pilot proxy retrieval if a
        proxy renewal fails.
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'Status':TRANSIENT_PILOT_STATUS,
                                            'OwnerDN': self.pilotDN,
                                            'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']

      # Build "<reference>:::<stamp>" identifiers for the CE. As soon as one pilot
      # has no stamp, fall back to the plain references for all the pilots.
      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      # Renew the pilot proxy of the CE if it is not valid anymore
      result = ce.isProxyValid()
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 600 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 500 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        if oldStatus == ceStatus:
          # Status did not change, continue
          continue
        elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
          # Pilot finished without reporting, consider it Aborted
          newStatus = 'Aborted'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      # isProxyValid() returns a result structure (its 'OK' flag is read in the
      # first loop above). A non-empty dictionary is always true, so the flag
      # must be tested explicitly, otherwise the proxy is never renewed here.
      result = ce.isProxyValid( 120 )
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        # Hand the freshly obtained proxy to the CE, not the previous self.proxy
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'OutputReady':'False',
                                            'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                              'Queue':queueName,
                                              'GridType':ceType,
                                              'GridSite':siteName,
                                              'AccountingSent':'False',
                                              'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Ejemplo n.º 7
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      abortedPilots = 0

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'Status':TRANSIENT_PILOT_STATUS,
                                            'OwnerDN': self.pilotDN,
                                            'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']

      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      # This proxy is used for checking the pilot status and renewals
      # We really need at least a few hours otherwise the renewed
      # proxy may expire before we check again...
      result = ce.isProxyValid( 3*3600 )
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 23300 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
        sinceLastUpdate = dateTime() - lastUpdateTime

        if oldStatus == ceStatus and ceStatus != "Unknown":
          # Normal status did not change, continue
          continue
        elif ceStatus == "Unknown" and oldStatus == "Unknown":
          if sinceLastUpdate < 3600*second:
            # Allow 1 hour of Unknown status assuming temporary problems on the CE
            continue
          else:
            newStatus = 'Aborted'
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Possible problems on the CE, let's keep the Unknown status for a while
          newStatus = 'Unknown'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
          if newStatus == "Aborted":
            abortedPilots += 1
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

      # If something wrong in the queue, make a pause for the job submission
      if abortedPilots:
        self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      if not ce.isProxyValid( 120 ):
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'OutputReady':'False',
                                            'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                              'Queue':queueName,
                                              'GridType':ceType,
                                              'GridSite':siteName,
                                              'AccountingSent':'False',
                                              'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Ejemplo n.º 8
0
    def updatePilotStatus(self):
        """Update status of pilots in transient states

        Two passes are made over ``self.queueDict``:

        1. For each queue, select the pilots owned by ``self.pilotDN`` /
           ``self.pilotGroup`` that are in ``TRANSIENT_PILOT_STATUS``, ask the
           CE for their current status and store any change in the pilot DB.
           When a pilot reaches a final status and ``self.getOutput`` is set,
           its output is fetched from the CE and stored.
        2. For each queue, fetch the output of pilots that are already in
           ``FINAL_PILOT_STATUS`` with ``OutputReady`` set to ``False`` (if
           ``self.getOutput`` is set) and send accounting for final pilots
           with ``AccountingSent`` set to ``False`` (if ``self.sendAccounting``
           is set).

        Failures of individual DB or CE calls are logged and the affected
        queue or pilot is skipped.

        :return: ``S_OK()`` on completion, or the failed result of
                 ``gProxyManager.getPilotProxyFromDIRACGroup`` when a pilot
                 proxy cannot be obtained
        """
        for queue in self.queueDict:
            queueInfo = self.queueDict[queue]
            ce = queueInfo["CE"]
            ceName = queueInfo["CEName"]
            queueName = queueInfo["QueueName"]
            ceType = queueInfo["CEType"]
            siteName = queueInfo["Site"]

            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "Status": TRANSIENT_PILOT_STATUS,
                    "OwnerDN": self.pilotDN,
                    "OwnerGroup": self.pilotGroup,
                }
            )
            if not result["OK"]:
                self.log.error("Failed to select pilots: %s" % result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]

            # Build "<reference>:::<stamp>" identifiers for the CE query. As soon
            # as one pilot has no stamp, fall back to the plain references for
            # the whole batch.
            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]["PilotStamp"]:
                    stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]["PilotStamp"])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            # Renew the CE proxy when the current one is not valid
            result = ce.isProxyValid()
            if not result["OK"]:
                result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
                if not result["OK"]:
                    return result
                self.proxy = result["Value"]
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots status from CE", "%s: %s" % (ceName, result["Message"]))
                continue
            pilotCEDict = result["Value"]

            for pRef in pilotRefs:
                newStatus = ""
                oldStatus = pilotDict[pRef]["Status"]
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = "Aborted"
                elif ceStatus != "Unknown":
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if newStatus:
                    self.log.info("Updating status to %s for pilot %s" % (newStatus, pRef))
                    result = pilotAgentsDB.setPilotStatus(pRef, newStatus, "", "Updated by SiteDirector")
                    if not result["OK"]:
                        # Do not lose a failed DB update silently
                        self.log.error("Failed to update pilot status", result["Message"])

                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]["OutputReady"].lower() == "false" and self.getOutput:
                        self.log.info("Retrieving output for pilot %s" % pRef)
                        pilotStamp = pilotDict[pRef]["PilotStamp"]
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ":::" + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result["OK"]:
                            self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                        else:
                            output, error = result["Value"]
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result["OK"]:
                                    self.log.error("Failed to store pilot output", result["Message"])
                            else:
                                self.log.warn("Empty pilot output not stored to PilotDB")

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            queueInfo = self.queueDict[queue]
            ce = queueInfo["CE"]

            # isProxyValid returns a result structure (it is subscripted with
            # "OK" in the first pass), so its "OK" flag must be tested rather
            # than the truthiness of the returned object itself.
            result = ce.isProxyValid(120)
            if not result["OK"]:
                result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
                if not result["OK"]:
                    return result
                # Install the freshly obtained proxy, not the previous one
                self.proxy = result["Value"]
                ce.setProxy(self.proxy, 940)

            ceName = queueInfo["CEName"]
            queueName = queueInfo["QueueName"]
            ceType = queueInfo["CEType"]
            siteName = queueInfo["Site"]
            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "OutputReady": "False",
                    "Status": FINAL_PILOT_STATUS,
                }
            )

            if not result["OK"]:
                self.log.error("Failed to select pilots", result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]
            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info("Retrieving output for pilot %s" % pRef)
                    pilotStamp = pilotDict[pRef]["PilotStamp"]
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ":::" + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result["OK"]:
                        self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                    else:
                        output, error = result["Value"]
                        result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                        if not result["OK"]:
                            self.log.error("Failed to store pilot output", result["Message"])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots(
                    {
                        "DestinationSite": ceName,
                        "Queue": queueName,
                        "GridType": ceType,
                        "GridSite": siteName,
                        "AccountingSent": "False",
                        "Status": FINAL_PILOT_STATUS,
                    }
                )

                if not result["OK"]:
                    self.log.error("Failed to select pilots", result["Message"])
                    continue
                pilotRefs = result["Value"]
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result["OK"]:
                    self.log.error("Failed to get pilots info from DB", result["Message"])
                    continue
                pilotDict = result["Value"]
                result = self.sendPilotAccounting(pilotDict)
                if not result["OK"]:
                    self.log.error("Failed to send pilot agent accounting")

        return S_OK()