Example #1
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      abortedPilots = 0

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'Status':TRANSIENT_PILOT_STATUS,
                                            'OwnerDN': self.pilotDN,
                                            'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']

      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      result = ce.isProxyValid()
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 23300 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
        sinceLastUpdate = dateTime() - lastUpdateTime

        if oldStatus == ceStatus and ceStatus != "Unknown":
          # Normal status did not change, continue
          continue
        elif ceStatus == "Unknown" and oldStatus == "Unknown":
          if sinceLastUpdate < 3600*second:
            # Allow 1 hour of Unknown status assuming temporary problems on the CE
            continue
          else:
            newStatus = 'Aborted'
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Possible problems on the CE, let's keep the Unknown status for a while
          newStatus = 'Unknown'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
          if newStatus == "Aborted":
            abortedPilots += 1
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

      # If something wrong in the queue, make a pause for the job submission
      if abortedPilots:
        self.failedQueues[queue] += 1 

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      if not ce.isProxyValid( 120 ):
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'OutputReady':'False',
                                           'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'GridType':ceType,
                                             'GridSite':siteName,
                                             'AccountingSent':'False',
                                             'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Example #2
0
    def updatePilotStatus(self):
        """ Update status of pilots in transient states
    """
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']

            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'Status': TRANSIENT_PILOT_STATUS,
                'OwnerDN': self.pilotDN,
                'OwnerGroup': self.pilotGroup
            })
            if not result['OK']:
                self.log.error('Failed to select pilots: %s' %
                               result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            #print "AT >>> pilotRefs", pilotRefs

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']

            #print "AT >>> pilotDict", pilotDict

            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]['PilotStamp']:
                    stampedPilotRefs.append(pRef + ":::" +
                                            pilotDict[pRef]['PilotStamp'])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            result = ce.isProxyValid()
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 600)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots status from CE',
                               '%s: %s' % (ceName, result['Message']))
                continue
            pilotCEDict = result['Value']

            #print "AT >>> pilotCEDict", pilotCEDict

            for pRef in pilotRefs:
                newStatus = ''
                oldStatus = pilotDict[pRef]['Status']
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = 'Aborted'
                elif ceStatus != 'Unknown':
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if newStatus:
                    self.log.info('Updating status to %s for pilot %s' %
                                  (newStatus, pRef))
                    result = pilotAgentsDB.setPilotStatus(
                        pRef, newStatus, '', 'Updated by SiteDirector')
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower(
                    ) == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error(
                                'Failed to get pilot output',
                                '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(
                                    pRef, output, error)
                                if not result['OK']:
                                    self.log.error(
                                        'Failed to store pilot output',
                                        result['Message'])
                            else:
                                self.log.warn(
                                    'Empty pilot output not stored to PilotDB')

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']

            if not ce.isProxyValid(120):
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 1000)
                if not result['OK']:
                    return result
                ce.setProxy(self.proxy, 940)

            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']
            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'OutputReady': 'False',
                'Status': FINAL_PILOT_STATUS
            })

            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']
            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output',
                                       '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        result = pilotAgentsDB.storePilotOutput(
                            pRef, output, error)
                        if not result['OK']:
                            self.log.error('Failed to store pilot output',
                                           result['Message'])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots({
                    'DestinationSite':
                    ceName,
                    'Queue':
                    queueName,
                    'GridType':
                    ceType,
                    'GridSite':
                    siteName,
                    'AccountingSent':
                    'False',
                    'Status':
                    FINAL_PILOT_STATUS
                })

                if not result['OK']:
                    self.log.error('Failed to select pilots',
                                   result['Message'])
                    continue
                pilotRefs = result['Value']
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result['OK']:
                    self.log.error('Failed to get pilots info from DB',
                                   result['Message'])
                    continue
                pilotDict = result['Value']
                result = self.sendPilotAccounting(pilotDict)
                if not result['OK']:
                    self.log.error('Failed to send pilot agent accounting')

        return S_OK()
Example #3
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'Status':TRANSIENT_PILOT_STATUS,
                                           'OwnerDN': self.pilotDN,
                                           'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      result = ce.isProxyValid()
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 600 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 500 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        if oldStatus == ceStatus:
          # Status did not change, continue
          continue
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Pilot finished without reporting, consider it Aborted
          newStatus = 'Aborted'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      if not ce.isProxyValid( 120 ):
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'OutputReady':'False',
                                           'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'GridType':ceType,
                                             'GridSite':siteName,
                                             'AccountingSent':'False',
                                             'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Example #4
0
def getGridJobOutput(pilotReference):
    """ Get the pilot job standard output and standard error files for the Grid job reference

      :param str pilotReference: a grid (job) pilot reference
  """

    result = pilotAgentsDB.getPilotInfo(pilotReference)
    if not result['OK'] or not result['Value']:
        return S_ERROR('Failed to get info for pilot ' + pilotReference)

    pilotDict = result['Value'][pilotReference]
    owner = pilotDict['OwnerDN']
    group = pilotDict['OwnerGroup']

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    result = pilotAgentsDB.getPilotOutput(pilotReference)
    if result['OK']:
        stdout = result['Value']['StdOut']
        error = result['Value']['StdErr']
        if stdout or error:
            resultDict = {}
            resultDict['StdOut'] = stdout
            resultDict['StdErr'] = error
            resultDict['OwnerDN'] = owner
            resultDict['OwnerGroup'] = group
            resultDict['FileList'] = []
            return S_OK(resultDict)
        else:
            gLogger.warn('Empty pilot output found', 'for %s' % pilotReference)

    # Instantiate the appropriate CE
    ceFactory = ComputingElementFactory()
    result = getQueue(pilotDict['GridSite'], pilotDict['DestinationSite'],
                      pilotDict['Queue'])
    if not result['OK']:
        return result
    queueDict = result['Value']
    gridEnv = getGridEnv()
    queueDict['GridEnv'] = gridEnv
    queueDict['WorkingDirectory'] = mkdtemp()
    result = ceFactory.getCE(pilotDict['GridType'],
                             pilotDict['DestinationSite'], queueDict)
    if not result['OK']:
        shutil.rmtree(queueDict['WorkingDirectory'])
        return result
    ce = result['Value']
    groupVOMS = getGroupOption(group, 'VOMSRole', group)
    result = gProxyManager.getPilotProxyFromVOMSGroup(owner, groupVOMS)
    if not result['OK']:
        gLogger.error(
            'Could not get proxy:', 'User "%s" Group "%s" : %s' %
            (owner, groupVOMS, result['Message']))
        return S_ERROR("Failed to get the pilot's owner proxy")
    proxy = result['Value']
    ce.setProxy(proxy)
    pilotStamp = pilotDict['PilotStamp']
    pRef = pilotReference
    if pilotStamp:
        pRef = pRef + ':::' + pilotStamp
    result = ce.getJobOutput(pRef)
    if not result['OK']:
        shutil.rmtree(queueDict['WorkingDirectory'])
        return result
    stdout, error = result['Value']
    if stdout:
        result = pilotAgentsDB.storePilotOutput(pilotReference, stdout, error)
        if not result['OK']:
            gLogger.error('Failed to store pilot output:', result['Message'])

    resultDict = {}
    resultDict['StdOut'] = stdout
    resultDict['StdErr'] = error
    resultDict['OwnerDN'] = owner
    resultDict['OwnerGroup'] = group
    resultDict['FileList'] = []
    shutil.rmtree(queueDict['WorkingDirectory'])
    return S_OK(resultDict)
Example #5
0
  def updatePilotStatus( self ):
    """ Update status of pilots in transient states
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      abortedPilots = 0

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'Status':TRANSIENT_PILOT_STATUS,
                                            'OwnerDN': self.pilotDN,
                                            'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']

      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      # This proxy is used for checking the pilot status and renewals
      # We really need at least a few hours otherwise the renewed
      # proxy may expire before we check again...
      result = ce.isProxyValid( 3*3600 )
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 23300 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        ceStatus = pilotCEDict[pRef]
        lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
        sinceLastUpdate = dateTime() - lastUpdateTime

        if oldStatus == ceStatus and ceStatus != "Unknown":
          # Normal status did not change, continue
          continue
        elif ceStatus == "Unknown" and oldStatus == "Unknown":
          if sinceLastUpdate < 3600*second:
            # Allow 1 hour of Unknown status assuming temporary problems on the CE
            continue
          else:
            newStatus = 'Aborted'
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Possible problems on the CE, let's keep the Unknown status for a while
          newStatus = 'Unknown'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
          if newStatus == "Aborted":
            abortedPilots += 1
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

      # If something wrong in the queue, make a pause for the job submission
      if abortedPilots:
        self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      if not ce.isProxyValid( 120 ):
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'OutputReady':'False',
                                            'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                              'Queue':queueName,
                                              'GridType':ceType,
                                              'GridSite':siteName,
                                              'AccountingSent':'False',
                                              'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
Example #6
0
    def updatePilotStatus(self):
        """ Update status of pilots in transient states
    """
        for queue in self.queueDict:
            ce = self.queueDict[queue]["CE"]
            ceName = self.queueDict[queue]["CEName"]
            queueName = self.queueDict[queue]["QueueName"]
            ceType = self.queueDict[queue]["CEType"]
            siteName = self.queueDict[queue]["Site"]

            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "Status": TRANSIENT_PILOT_STATUS,
                    "OwnerDN": self.pilotDN,
                    "OwnerGroup": self.pilotGroup,
                }
            )
            if not result["OK"]:
                self.log.error("Failed to select pilots: %s" % result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue

            # print "AT >>> pilotRefs", pilotRefs

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]

            # print "AT >>> pilotDict", pilotDict

            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]["PilotStamp"]:
                    stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]["PilotStamp"])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            result = ce.isProxyValid()
            if not result["OK"]:
                result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
                if not result["OK"]:
                    return result
                self.proxy = result["Value"]
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots status from CE", "%s: %s" % (ceName, result["Message"]))
                continue
            pilotCEDict = result["Value"]

            # print "AT >>> pilotCEDict", pilotCEDict

            for pRef in pilotRefs:
                newStatus = ""
                oldStatus = pilotDict[pRef]["Status"]
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = "Aborted"
                elif ceStatus != "Unknown":
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if newStatus:
                    self.log.info("Updating status to %s for pilot %s" % (newStatus, pRef))
                    result = pilotAgentsDB.setPilotStatus(pRef, newStatus, "", "Updated by SiteDirector")
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]["OutputReady"].lower() == "false" and self.getOutput:
                        self.log.info("Retrieving output for pilot %s" % pRef)
                        pilotStamp = pilotDict[pRef]["PilotStamp"]
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ":::" + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result["OK"]:
                            self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                        else:
                            output, error = result["Value"]
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result["OK"]:
                                    self.log.error("Failed to store pilot output", result["Message"])
                            else:
                                self.log.warn("Empty pilot output not stored to PilotDB")

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            ce = self.queueDict[queue]["CE"]

            if not ce.isProxyValid(120):
                result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
                if not result["OK"]:
                    return result
                ce.setProxy(self.proxy, 940)

            ceName = self.queueDict[queue]["CEName"]
            queueName = self.queueDict[queue]["QueueName"]
            ceType = self.queueDict[queue]["CEType"]
            siteName = self.queueDict[queue]["Site"]
            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "OutputReady": "False",
                    "Status": FINAL_PILOT_STATUS,
                }
            )

            if not result["OK"]:
                self.log.error("Failed to select pilots", result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]
            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info("Retrieving output for pilot %s" % pRef)
                    pilotStamp = pilotDict[pRef]["PilotStamp"]
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ":::" + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result["OK"]:
                        self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                    else:
                        output, error = result["Value"]
                        result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                        if not result["OK"]:
                            self.log.error("Failed to store pilot output", result["Message"])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots(
                    {
                        "DestinationSite": ceName,
                        "Queue": queueName,
                        "GridType": ceType,
                        "GridSite": siteName,
                        "AccountingSent": "False",
                        "Status": FINAL_PILOT_STATUS,
                    }
                )

                if not result["OK"]:
                    self.log.error("Failed to select pilots", result["Message"])
                    continue
                pilotRefs = result["Value"]
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result["OK"]:
                    self.log.error("Failed to get pilots info from DB", result["Message"])
                    continue
                pilotDict = result["Value"]
                result = self.sendPilotAccounting(pilotDict)
                if not result["OK"]:
                    self.log.error("Failed to send pilot agent accounting")

        return S_OK()