コード例 #1
0
ファイル: RSSCache.py プロジェクト: DIRACGrid-test/DIRAC
class RSSCache( object ):
  '''
  Cache with purgeThread integrated

  Entries are stored in a DictCache and are (re)filled from `updateFunc`.
  The outcome of every refresh is remembered in a history list of
  ( updateTime, result ) tuples, newest first.

  The public getters serialise on one internal, NON re-entrant lock. The
  `refreshCache*` methods do not take that lock themselves: callers must wrap
  them with acquireLock / releaseLock, and must not call any getter while
  holding the lock ( it would block forever ).
  '''

  def __init__( self, lifeTime, updateFunc = None, cacheHistoryLifeTime = None ):
    '''
    Constructor

    :param lifeTime: lifetime handed to the DictCache for every entry added;
      it is also the number of seconds the refresh thread sleeps between two
      refreshes.
    :param updateFunc: callable without arguments returning a dictionary with
      an 'OK' key and, when 'OK' is true, a 'Value' dictionary holding the
      new cache content.
    :param cacheHistoryLifeTime: hours a refresh record is kept in the
      history; any falsy value ( None, 0 ) selects the default of 24.
    '''

    self.__lifeTime             = lifeTime
    # lifetime of the history in hours
    self.__cacheHistoryLifeTime = cacheHistoryLifeTime or 24
    self.__updateFunc           = updateFunc

    # RSSCache
    self.__rssCache       = DictCache()
    self.__rssCacheStatus = [] # ( updateTime, message ), newest first
    self.__rssCacheLock   = threading.Lock()

    # Create the refresh thread, it only runs after startRefreshThread
    self.__refreshStop    = False
    self.__refreshThread  = threading.Thread( target = self.__refreshCacheThreadRun )
    # `daemon` attribute instead of the deprecated setDaemon()
    self.__refreshThread.daemon = True

  def startRefreshThread( self ):
    '''
    Run refresh thread.
    '''
    self.__refreshThread.start()

  def stopRefreshThread( self ):
    '''
    Stop refresh thread. The flag is only checked between two refreshes, so
    the thread ends after its current sleep.
    '''
    self.__refreshStop = True

  def isCacheAlive( self ):
    '''
    Returns status of the cache refreshing thread

    :return: S_OK( bool )
    '''
    # is_alive() exists on every supported interpreter, isAlive() is gone
    # from Python 3.9 onwards
    return S_OK( self.__refreshThread.is_alive() )

  def setLifeTime( self, lifeTime ):
    '''
    Set cache life time
    '''
    self.__lifeTime = lifeTime

  def setCacheHistoryLifeTime( self, cacheHistoryLifeTime ):
    '''
    Set cache history life time ( hours )
    '''
    self.__cacheHistoryLifeTime = cacheHistoryLifeTime

  def getCacheKeys( self ):
    '''
    List all the keys stored in the cache.

    :return: S_OK( keys )
    '''
    with self.__rssCacheLock:
      keys = self.__rssCache.getKeys()

    return S_OK( keys )

  def acquireLock( self ):
    '''
    Acquires RSSCache lock
    '''
    self.__rssCacheLock.acquire()

  def releaseLock( self ):
    '''
    Releases RSSCache lock
    '''
    self.__rssCacheLock.release()

  def getCacheStatus( self ):
    '''
    Return the latest cache status

    :return: S_OK( { updateTime : result } ) for the newest refresh, or
      S_OK( {} ) if the cache was never refreshed.
    '''
    with self.__rssCacheLock:
      if self.__rssCacheStatus:
        res = dict( [ self.__rssCacheStatus[ 0 ] ] )
      else:
        res = {}
    return S_OK( res )

  def getCacheHistory( self ):
    '''
    Return the cache updates history

    :return: S_OK( { updateTime : result, ... } )
    '''
    with self.__rssCacheLock:
      res = dict( self.__rssCacheStatus )
    return S_OK( res )

  def get( self, resourceKey ):
    '''
    Gets the resource(s) status(es). Every resource can have multiple statuses,
    so in order to speed up things, we store them on the cache as follows::

      { (<resourceName>,<resourceStatusType0>) : whatever0,
        (<resourceName>,<resourceStatusType1>) : whatever1,
      }

    :return: S_OK( { resourceKey : value } ), or S_ERROR if the key is not
      cached or its value is falsy.
    '''

    with self.__rssCacheLock:
      resourceStatus = self.__rssCache.get( resourceKey )

    if resourceStatus:
      return S_OK( { resourceKey : resourceStatus } )
    # str() is required: the keys are tuples and a bare tuple on the right of
    # '%' is unpacked as the list of format arguments
    return S_ERROR( 'Cannot get %s' % str( resourceKey ) )

  def getBulk( self, resourceKeys ):
    '''
    Gets values for resourceKeys in one ATOMIC operation.

    :return: S_OK( { resourceKey : value, ... } ) if every key is found,
      S_ERROR naming the first key that is missing otherwise.
    '''

    result = {}

    # The context manager releases the lock on the early error return too
    with self.__rssCacheLock:
      for resourceKey in resourceKeys:

        resourceRow = self.__rssCache.get( resourceKey )
        if not resourceRow:
          return S_ERROR( 'Cannot get %s' % str( resourceKey ) )
        result[ resourceKey ] = resourceRow

    return S_OK( result )

  def resetCache( self ):
    '''
    Reset cache.
    '''
    with self.__rssCacheLock:
      self.__rssCache.purgeAll()

    return S_OK()

  def refreshCache( self ):
    '''
    Clears the cache and gets its latest version, not Thread safe !
    Acquire a lock before using it ! ( and release it afterwards ! )

    The cache is purged first, so it stays empty if the update fails.

    :return: S_OK( number of items added ), S_ERROR if there is no update
      function, or whatever failed result the update function returned.
    '''

    self.__rssCache.purgeAll()

    if self.__updateFunc is None:
      return S_ERROR( 'RSSCache has no updateFunction' )
    newCache = self.__updateFunc()
    if not newCache[ 'OK' ]:
      return newCache

    return self.__updateCache( newCache[ 'Value' ] )

  def refreshCacheAndHistory( self ):
    '''
    Method that refreshes the cache and updates the history. Not thread safe,
    you must acquire a lock before using it, and release it right after !
    '''

    refreshResult = self.refreshCache()

    now          = datetime.datetime.utcnow()
    oldestToKeep = now - datetime.timedelta( hours = self.__cacheHistoryLifeTime )

    # The history is newest-first, so expired records are at the tail. Drop
    # all of them, not just one, so that the history also shrinks right away
    # when its lifetime has been lowered.
    while self.__rssCacheStatus and self.__rssCacheStatus[ -1 ][ 0 ] < oldestToKeep:
      self.__rssCacheStatus.pop()

    self.__rssCacheStatus.insert( 0, ( now, refreshResult ) )

################################################################################
# Private methods

  def __updateCache( self, newCache ):
    '''
    The new cache must be a dictionary, which should look like::

      { ( <resourceName>,<resourceStatusType0>) : whatever0,
        ( <resourceName>,<resourceStatusType1>) : whatever1,
      }

    :return: S_OK( number of items added )
    '''

    itemsCounter = 0

    for cacheKey, cacheValue in newCache.items():
      self.__rssCache.add( cacheKey, self.__lifeTime, value = cacheValue )
      itemsCounter += 1

    return S_OK( itemsCounter )

  def __refreshCacheThreadRun( self ):
    '''
    Method that refreshes periodically the cache.
    '''

    while not self.__refreshStop:

      # If the refresh raises, the thread dies but the lock is released, so
      # the getters keep working instead of blocking forever
      with self.__rssCacheLock:
        self.refreshCacheAndHistory()

      time.sleep( self.__lifeTime )

    self.__refreshStop = False
コード例 #2
0
class RSSCache:
    """
    Cache with purgeThread integrated

    Entries are stored in a DictCache and are (re)filled from `updateFunc`.
    The outcome of every refresh is remembered in a history list of
    ( updateTime, result ) tuples, newest first.

    The public getters serialise on one internal, NON re-entrant lock. The
    `refreshCache*` methods do not take that lock themselves: callers must
    wrap them with acquireLock / releaseLock, and must not call any getter
    while holding the lock ( it would block forever ).
    """
    def __init__(self, lifeTime, updateFunc=None, cacheHistoryLifeTime=None):
        """
        Constructor

        :param lifeTime: lifetime handed to the DictCache for every entry
          added; it is also the number of seconds the refresh thread sleeps
          between two refreshes.
        :param updateFunc: callable without arguments returning a dictionary
          with an 'OK' key and, when 'OK' is true, a 'Value' dictionary
          holding the new cache content.
        :param cacheHistoryLifeTime: hours a refresh record is kept in the
          history; any falsy value ( None, 0 ) selects the default of 24.
        """

        self.__lifeTime = lifeTime
        # lifetime of the history in hours
        self.__cacheHistoryLifeTime = cacheHistoryLifeTime or 24
        self.__updateFunc = updateFunc

        # RSSCache
        self.__rssCache = DictCache()
        self.__rssCacheStatus = []  # ( updateTime, message ), newest first
        self.__rssCacheLock = threading.Lock()

        # Create the refresh thread, it only runs after startRefreshThread
        self.__refreshStop = False
        self.__refreshThread = threading.Thread(
            target=self.__refreshCacheThreadRun)
        # `daemon` attribute instead of the deprecated setDaemon()
        self.__refreshThread.daemon = True

    def startRefreshThread(self):
        """
        Run refresh thread.
        """
        self.__refreshThread.start()

    def stopRefreshThread(self):
        """
        Stop refresh thread. The flag is only checked between two refreshes,
        so the thread ends after its current sleep.
        """
        self.__refreshStop = True

    def isCacheAlive(self):
        """
        Returns status of the cache refreshing thread

        :return: S_OK( bool )
        """
        return S_OK(self.__refreshThread.is_alive())

    def setLifeTime(self, lifeTime):
        """
        Set cache life time
        """
        self.__lifeTime = lifeTime

    def setCacheHistoryLifeTime(self, cacheHistoryLifeTime):
        """
        Set cache history life time ( hours )
        """
        self.__cacheHistoryLifeTime = cacheHistoryLifeTime

    def getCacheKeys(self):
        """
        List all the keys stored in the cache.

        :return: S_OK( keys )
        """
        with self.__rssCacheLock:
            keys = self.__rssCache.getKeys()

        return S_OK(keys)

    def acquireLock(self):
        """
        Acquires RSSCache lock
        """
        self.__rssCacheLock.acquire()

    def releaseLock(self):
        """
        Releases RSSCache lock
        """
        self.__rssCacheLock.release()

    def getCacheStatus(self):
        """
        Return the latest cache status

        :return: S_OK( { updateTime : result } ) for the newest refresh, or
          S_OK( {} ) if the cache was never refreshed.
        """
        with self.__rssCacheLock:
            if self.__rssCacheStatus:
                res = dict([self.__rssCacheStatus[0]])
            else:
                res = {}
        return S_OK(res)

    def getCacheHistory(self):
        """
        Return the cache updates history

        :return: S_OK( { updateTime : result, ... } )
        """
        with self.__rssCacheLock:
            res = dict(self.__rssCacheStatus)
        return S_OK(res)

    def get(self, resourceKey):
        """
        Gets the resource(s) status(es). Every resource can have multiple statuses,
        so in order to speed up things, we store them on the cache as follows::

          { (<resourceName>,<resourceStatusType0>) : whatever0,
            (<resourceName>,<resourceStatusType1>) : whatever1,
          }

        :return: S_OK( { resourceKey : value } ), or S_ERROR if the key is
          not cached or its value is falsy.
        """

        with self.__rssCacheLock:
            resourceStatus = self.__rssCache.get(resourceKey)

        if resourceStatus:
            return S_OK({resourceKey: resourceStatus})
        # str() is required: the keys are tuples and a bare tuple on the
        # right of '%' is unpacked as the list of format arguments
        return S_ERROR("Cannot get %s" % str(resourceKey))

    def getBulk(self, resourceKeys):
        """
        Gets values for resourceKeys in one ATOMIC operation.

        :return: S_OK( { resourceKey : value, ... } ) if every key is found,
          S_ERROR naming the first key that is missing otherwise.
        """

        result = {}

        # The context manager releases the lock on the early error return too
        with self.__rssCacheLock:
            for resourceKey in resourceKeys:

                resourceRow = self.__rssCache.get(resourceKey)
                if not resourceRow:
                    return S_ERROR("Cannot get %s" % str(resourceKey))
                result[resourceKey] = resourceRow

        return S_OK(result)

    def resetCache(self):
        """
        Reset cache.
        """
        with self.__rssCacheLock:
            self.__rssCache.purgeAll()

        return S_OK()

    def refreshCache(self):
        """
        Clears the cache and gets its latest version, not Thread safe !
        Acquire a lock before using it ! ( and release it afterwards ! )

        The cache is purged first, so it stays empty if the update fails.

        :return: S_OK( number of items added ), S_ERROR if there is no update
          function, or whatever failed result the update function returned.
        """

        self.__rssCache.purgeAll()

        if self.__updateFunc is None:
            return S_ERROR("RSSCache has no updateFunction")
        newCache = self.__updateFunc()
        if not newCache["OK"]:
            return newCache

        return self.__updateCache(newCache["Value"])

    def refreshCacheAndHistory(self):
        """
        Method that refreshes the cache and updates the history. Not thread safe,
        you must acquire a lock before using it, and release it right after !
        """

        refreshResult = self.refreshCache()

        now = datetime.datetime.utcnow()
        oldestToKeep = now - datetime.timedelta(
            hours=self.__cacheHistoryLifeTime)

        # The history is newest-first, so expired records are at the tail.
        # Drop all of them, not just one, so that the history also shrinks
        # right away when its lifetime has been lowered.
        while (self.__rssCacheStatus
               and self.__rssCacheStatus[-1][0] < oldestToKeep):
            self.__rssCacheStatus.pop()

        self.__rssCacheStatus.insert(0, (now, refreshResult))

    ################################################################################
    # Private methods

    def __updateCache(self, newCache):
        """
        The new cache must be a dictionary, which should look like::

          { ( <resourceName>,<resourceStatusType0>) : whatever0,
            ( <resourceName>,<resourceStatusType1>) : whatever1,
          }

        :return: S_OK( number of items added )
        """

        itemsCounter = 0

        for cacheKey, cacheValue in newCache.items():
            self.__rssCache.add(cacheKey, self.__lifeTime, value=cacheValue)
            itemsCounter += 1

        return S_OK(itemsCounter)

    def __refreshCacheThreadRun(self):
        """
        Method that refreshes periodically the cache.
        """

        while not self.__refreshStop:

            # If the refresh raises, the thread dies but the lock is
            # released, so the getters keep working instead of blocking
            with self.__rssCacheLock:
                self.refreshCacheAndHistory()

            time.sleep(self.__lifeTime)

        self.__refreshStop = False
コード例 #3
0
class Cache(object):
    """
    Basic cache: a DictCache filled on demand from an update function.

    WARNING: no method here takes the lock on its own. Callers are expected
    to bracket their calls with acquireLock / releaseLock.
    """
    def __init__(self, lifeTime, updateFunc):
        """
        Constructor

        :Parameters:
          **lifeTime** - `int`
            Lifetime of the elements in the cache ( seconds ! )
          **updateFunc** - `function`
            Must return a S_OK | S_ERROR object; in the S_OK case its value
            must be a dictionary.
        """

        # Stretch the lifetime by a random 0-20% so that thousands of jobs
        # started together do not see their caches expire at the same moment.
        stretch = 1 + 0.2 * random.random()

        self.log = gLogger.getSubLogger(self.__class__.__name__)

        self.__lifeTime = int(lifeTime * stretch)
        self.__updateFunc = updateFunc
        # Records handed out must stay valid for at least this many seconds.
        self.__validSeconds = 30

        # Storage and the lock ring; the lock is registered under the
        # class name.
        self.__cache = DictCache()
        self.__cacheLock = LockRing()
        self.__cacheLock.getLock(self.__class__.__name__)

    #.............................................................................
    # internal cache object getter

    def cacheKeys(self):
        """
        Cache keys getter

        :returns: list with valid keys on the cache
        """

        minimumValidity = self.__validSeconds
        return self.__cache.getKeys(validSeconds=minimumValidity)

    #.............................................................................
    # acquire / release Locks

    def acquireLock(self):
        """
        Acquires Cache lock
        """

        lockName = self.__class__.__name__
        self.__cacheLock.acquire(lockName)

    def releaseLock(self):
        """
        Releases Cache lock
        """

        lockName = self.__class__.__name__
        self.__cacheLock.release(lockName)

    #.............................................................................
    # Cache getters

    def get(self, cacheKeys):
        """
        Looks up every key of cacheKeys. Succeeds only if all of them are
        present and still valid; the first key that is not aborts the lookup.

        :Parameters:
          **cacheKeys** - `list`
            list of keys to be extracted from the cache

        :return: S_OK( { key : value } ) | S_ERROR
        """

        found = {}

        for key in cacheKeys:

            row = self.__cache.get(key, validSeconds=self.__validSeconds)
            if row:
                found[key] = row
                continue

            keyText = str(key)
            self.log.error(keyText)
            return S_ERROR('Cannot get %s' % str(key))

        return S_OK(found)

    #.............................................................................
    # Cache refreshers

    def refreshCache(self):
        """
        Purges the cache and gets fresh data from the update function.

        :return: S_OK | S_ERROR. If the first, its content is the new cache.
        """

        self.log.verbose('refreshing...')

        self.__cache.purgeAll()

        fetched = self.__updateFunc()
        if fetched['OK']:
            stored = self.__updateCache(fetched['Value'])
            self.log.verbose('refreshed')
            return stored

        self.log.error(fetched['Message'])
        return fetched

    #.............................................................................
    # Private methods

    def __updateCache(self, newCache):
        """
        Inserts every item of newCache in the internal cache, each with a
        duration of <self.__lifeTime> seconds.

        :Parameters:
          **newCache** - `dict`
            dictionary containing a new cache

        :return: S_OK wrapping the newCache argument.
        """

        duration = self.__lifeTime
        for key, value in newCache.items():
            self.__cache.add(key, duration, value=value)

        # Insertions are assumed not to fail, so this always reports success.
        return S_OK(newCache)
コード例 #4
0
class DIRACPilotDirector(PilotDirector):
    """
    DIRAC PilotDirector class

    Submits pilots directly to the ComputingElement objects registered in
    ``computingElementDict``. Every pilot is a wrapper script, written by
    ``_writePilotScript``, that embeds the proxy, the pilot script and the
    install script.
    """
    def __init__(self, submitPool):
        """
        Define some defaults and call parent __init__

        Exits the process ( ``sys.exit()`` ) when ``/LocalSite/Site`` is not
        defined in the configuration.
        """
        self.gridMiddleware = 'DIRAC'

        PilotDirector.__init__(self, submitPool)

        self.computingElementList = COMPUTING_ELEMENTS
        self.computingElementDict = {}
        self.addComputingElement(self.computingElementList)

        self.siteName = gConfig.getValue('/LocalSite/Site', '')
        if not self.siteName:
            self.log.error(
                'Can not run a Director if Site Name is not defined')
            sys.exit()

        self.__failingCECache = DictCache()
        self.__ticketsCECache = DictCache()

    def configure(self, csSection, submitPool):
        """
        Here goes common configuration for DIRAC PilotDirector

        Reloads the configuration, removes every CE currently listed in the
        failing-CE cache and selects the first remaining CE as
        ``self.computingElement``. Returns early, leaving
        ``self.computingElement`` untouched, when no CE is left.
        """

        PilotDirector.configure(self, csSection, submitPool)
        self.reloadConfiguration(csSection, submitPool)

        self.__failingCECache.purgeExpired()
        self.__ticketsCECache.purgeExpired()

        for ce in self.__failingCECache.getKeys():
            # pop with a default never raises, no try / except needed
            self.computingElementDict.pop(ce, None)

        if not self.computingElementDict:
            return
        self.log.info(' ComputingElements:',
                      ', '.join(self.computingElementDict.keys()))

        # FIXME: this is to start testing
        # next( iter( ... ) ) picks the same first item as items()[0] but
        # also works where items() is not a list
        _ceName, computingElementDict = next(
            iter(self.computingElementDict.items()))

        self.computingElement = computingElementDict['CE']

        self.log.debug(self.computingElement.getCEStatus())

        self.log.info(' SiteName:', self.siteName)

    def configureFromSection(self, mySection):
        """
        reload from CS

        Reads ``<mySection>/ComputingElements`` and ``<mySection>/SiteName``,
        keeping the current values as defaults, and registers any new CE.
        """
        PilotDirector.configureFromSection(self, mySection)

        self.computingElementList = gConfig.getValue(
            mySection + '/ComputingElements', self.computingElementList)
        self.addComputingElement(self.computingElementList)

        self.siteName = gConfig.getValue(mySection + '/SiteName',
                                         self.siteName)

    def addComputingElement(self, ceList):
        """
        Check if a CE object for the current CE is available,
        instantiate one if necessary

        Stops at the first CE that cannot be created; the CEs after it in
        ceList are not processed.
        """
        for CE in ceList:
            if CE not in self.computingElementDict:
                ceFactory = ComputingElementFactory()
                ceInstance = ceFactory.getCE(ceName=CE)
                if not ceInstance['OK']:
                    self.log.error('Can not create CE object:',
                                   ceInstance['Message'])
                    return
                self.computingElementDict[CE] = ceInstance[
                    'Value'].ceConfigDict
                # add the 'CE' instance at the end to avoid being overwritten
                self.computingElementDict[CE]['CE'] = ceInstance['Value']

    def _submitPilots(self, workDir, taskQueueDict, pilotOptions,
                      pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ,
                      proxy, pilotsPerJob):
        """
        This method does the actual pilot submission to the DIRAC CE
        The logic is as follows:
        - If list matching is enabled and no queue is available, it returns
          error
        - It creates a temp directory under workDir and moves into it
        - It prepares a PilotScript for every CE and submits pilots to it
        - It always restores the working directory and removes the temp one

        ceMask, submitPrivatePilot, privateTQ and pilotsPerJob are accepted
        for interface compatibility and are not used here. pilotOptions is
        not modified.

        :return: S_OK( number of submitted pilots ) | S_ERROR
        """

        taskQueueID = taskQueueDict['TaskQueueID']

        pilotRequirements = [('CPUTime', taskQueueDict['CPUTime'])]
        # do we need to care about anything else?
        pilotRequirementsString = str(pilotRequirements)

        # Check that there are available queues for the Jobs. The check only
        # makes sense when list matching is enabled: outside of this branch
        # there is nothing to test.
        if self.enableListMatch:
            availableQueues = self.listMatchCache.get(
                pilotRequirementsString)
            if availableQueues is None:
                availableQueues = self._listQueues(pilotRequirements)
                if availableQueues != False:
                    self.listMatchCache.add(pilotRequirementsString,
                                            self.listMatchDelay,
                                            availableQueues)
                    self.log.verbose(
                        'Available Queues for TaskQueue ',
                        "%s: %s" % (taskQueueID, str(availableQueues)))

            if not availableQueues:
                return S_ERROR(ERROR_CE + ' TQ: %d' % taskQueueID)

        # Work on a copy so that the caller's list is left alone
        siteOptions = list(pilotOptions)
        # set the Site Name
        siteOptions.append("-n '%s'" % self.siteName)

        baseDir = os.getcwd()
        workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID,
                                            dir=workDir)
        self.log.verbose('Using working Directory:', workingDirectory)

        try:
            os.chdir(workingDirectory)
            return self.__submitToComputingElements(workingDirectory,
                                                    siteOptions,
                                                    pilotsToSubmit, proxy)
        finally:
            # Runs on success, on every error return and on unexpected
            # exceptions alike
            self.__leaveWorkingDirectory(baseDir, workingDirectory)

    def __submitToComputingElements(self, workingDirectory, pilotOptions,
                                    pilotsToSubmit, proxy):
        """
        Write a pilot script for every CE and submit pilots until
        pilotsToSubmit have been submitted in total or the CEs are full.

        :return: S_OK( number of submitted pilots ) | S_ERROR
        """
        submittedPilots = 0

        # submit pilots for every CE available
        for ceName in list(self.computingElementDict.keys()):
            ceConfigDict = self.computingElementDict[ceName]
            computingElement = ceConfigDict['CE']

            # add possible requirements from Site and CE; start from the
            # common options so that requirements of one CE do not leak into
            # the pilots of the next one
            ceOptions = list(pilotOptions)
            for req, val in getResourceDict(ceName).items():
                ceOptions.append("-o '/AgentJobRequirements/%s=%s'" %
                                 (req, val))

            httpProxy = ''
            if 'HttpProxy' in ceConfigDict:
                httpProxy = ceConfigDict['HttpProxy']

            # Empty means "use the default temp location" in the wrapper
            pilotExecDir = ''
            if 'JobExecDir' in ceConfigDict:
                pilotExecDir = ceConfigDict['JobExecDir']

            try:
                pilotScript = self._writePilotScript(workingDirectory,
                                                     ceOptions, proxy,
                                                     httpProxy, pilotExecDir)
            except Exception:
                self.log.exception(ERROR_SCRIPT)
                return S_ERROR(ERROR_SCRIPT)
            # _writePilotScript reports compression failures by returning an
            # S_ERROR result instead of a file name.
            # NOTE(review): assumes S_ERROR results are dict instances - confirm
            if isinstance(pilotScript, dict):
                self.log.error(ERROR_SCRIPT)
                return S_ERROR(ERROR_SCRIPT)

            self.log.info("Pilots to submit: ", pilotsToSubmit)
            while submittedPilots < pilotsToSubmit:
                # Find out how many pilots can be submitted
                ret = computingElement.available()
                if not ret['OK']:
                    self.log.error(
                        'Can not determine if pilot should be submitted: ',
                        ret['Message'])
                    break
                maxPilotsToSubmit = ret['Value']
                self.log.info("Submit Pilots: ", maxPilotsToSubmit)
                if not maxPilotsToSubmit:
                    break
                # submit the pilots and then check again
                for _i in range(
                        min(maxPilotsToSubmit,
                            pilotsToSubmit - submittedPilots)):
                    submission = computingElement.submitJob(
                        pilotScript, '', '')
                    if not submission['OK']:
                        self.log.error('Pilot submission failed: ',
                                       submission['Message'])
                        return S_ERROR('Pilot submission failed after ' +
                                       str(submittedPilots) +
                                       ' pilots submitted successful')
                    submittedPilots += 1
                    # let the batch system some time to digest the submitted job
                    time.sleep(1)

            #next CE

        return S_OK(submittedPilots)

    def __leaveWorkingDirectory(self, baseDir, workingDirectory):
        """
        Go back to baseDir and remove workingDirectory, best effort.
        """
        try:
            os.chdir(baseDir)
            shutil.rmtree(workingDirectory)
        except Exception:
            # Cleanup must never mask the submission result
            self.log.verbose('Could not clean up working Directory:',
                             workingDirectory)

    def _listQueues(self, pilotRequirements):
        """
        For each defined CE return the list of Queues with available, running and waiting slots,
        matching the requirements of the pilots.
        Currently only CPU time is considered

        :return: the value reported by the CE, or False on failure
        """
        result = self.computingElement.available(pilotRequirements)
        if not result['OK']:
            self.log.error('Can not determine available queues',
                           result['Message'])
            return False
        return result['Value']

    def _writePilotScript(self, workingDirectory, pilotOptions, proxy,
                          httpProxy, pilotExecDir):
        """
        Prepare the script to execute the pilot
        For the moment it will do like Grid Pilots, a full DIRAC installation

        It assumes that the pilot script will have access to the submit working directory

        :return: path of the wrapper script created in workingDirectory, or
          an S_ERROR result if the proxy, dirac-pilot or dirac-install could
          not be compressed.
        """
        try:
            # Context managers so that the files are closed even when
            # reading or compressing fails
            with open(self.pilot, "rb") as pilotFile:
                pilotContent = pilotFile.read()
            with open(self.install, "rb") as installFile:
                installContent = installFile.read()

            # NOTE(review): base64.encodestring only exists up to Python 3.8;
            # this module, like the wrapper below, is Python 2 code
            compressedAndEncodedProxy = base64.encodestring(
                bz2.compress(proxy.dumpAllToString()['Value'])).replace(
                    '\n', '')
            compressedAndEncodedPilot = base64.encodestring(
                bz2.compress(pilotContent, 9)).replace('\n', '')
            compressedAndEncodedInstall = base64.encodestring(
                bz2.compress(installContent, 9)).replace('\n', '')
        except Exception:
            self.log.exception(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )
            return S_ERROR(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )

        localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, stat, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  os.chdir( pilotWorkingDirectory )
  open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedProxy)s" ) ) )
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedPilot)s" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedInstall)s" ) ) )
  os.chmod("proxy", stat.S_IRUSR | stat.S_IWUSR)
  os.chmod("%(pilotScript)s", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
  os.chmod("%(installScript)s", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % {
            'compressedAndEncodedProxy': compressedAndEncodedProxy,
            'compressedAndEncodedPilot': compressedAndEncodedPilot,
            'compressedAndEncodedInstall': compressedAndEncodedInstall,
            'httpProxy': httpProxy,
            'pilotScript': os.path.basename(self.pilot),
            'installScript': os.path.basename(self.install),
            'pilotOptions': ' '.join(pilotOptions),
            'pilotExecDir': pilotExecDir
        }

        fd, name = tempfile.mkstemp(suffix='_pilotwrapper.py',
                                    prefix='DIRAC_',
                                    dir=workingDirectory)
        with os.fdopen(fd, 'w') as pilotWrapper:
            pilotWrapper.write(localPilot)

        return name

    def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup,
                                     requiredTimeLeft):
        """
        Download a limited pilot proxy with VOMS extensions depending on the group

        :return: the result of gProxyManager.downloadProxy when the group has
          no VOMS attribute, of gProxyManager.downloadVOMSProxy otherwise.
        """
        #Assign VOMS attribute
        vomsAttr = CS.getVOMSAttributeForGroup(ownerGroup)
        if not vomsAttr:
            self.log.info(
                "Downloading a proxy without VOMS extensions for %s@%s" %
                (ownerDN, ownerGroup))
            return gProxyManager.downloadProxy(
                ownerDN,
                ownerGroup,
                limited=True,
                requiredTimeLeft=requiredTimeLeft)

        self.log.info(
            "Downloading a proxy with '%s' VOMS extension for %s@%s" %
            (vomsAttr, ownerDN, ownerGroup))
        return gProxyManager.downloadVOMSProxy(
            ownerDN,
            ownerGroup,
            limited=True,
            requiredTimeLeft=requiredTimeLeft,
            requiredVOMSAttribute=vomsAttr)
コード例 #5
0
ファイル: DIRACPilotDirector.py プロジェクト: CinziaLu/DIRAC
class DIRACPilotDirector(PilotDirector):
  """
    DIRAC PilotDirector class

    Submits pilots directly to the ComputingElement objects created by
    ComputingElementFactory for the names listed in the configuration.
  """
  def __init__( self, submitPool ):
    """
     Define some defaults and call parent __init__

     The process is terminated (sys.exit) when no site name is found under
     /LocalSite/Site.
    """
    self.gridMiddleware    = 'DIRAC'

    PilotDirector.__init__( self, submitPool )

    self.computingElementList = COMPUTING_ELEMENTS
    self.computingElementDict = {}
    self.addComputingElement( self.computingElementList )

    self.siteName          = gConfig.getValue('/LocalSite/Site','')
    if not self.siteName:
      self.log.error( 'Can not run a Director if Site Name is not defined' )
      sys.exit()

    self.__failingCECache  = DictCache()
    self.__ticketsCECache  = DictCache()

  def configure(self, csSection, submitPool ):
    """
     Here goes common configuration for DIRAC PilotDirector

     After reloading the configuration, every CE whose name is a key of the
     failing-CE cache is removed from self.computingElementDict and the
     first remaining CE becomes self.computingElement. Nothing else is done
     when no CE is left.
    """

    PilotDirector.configure( self, csSection, submitPool )
    self.reloadConfiguration( csSection, submitPool )

    self.__failingCECache.purgeExpired()
    self.__ticketsCECache.purgeExpired()

    for ce in self.__failingCECache.getKeys():
      # pop with a default is a no-op for CEs that are not (or no longer) known
      self.computingElementDict.pop( ce, None )

    if not self.computingElementDict:
      return
    self.log.info( ' ComputingElements:', ', '.join(self.computingElementDict.keys()) )

    # FIXME: this is to start testing
    # next( iter( ... ) ) takes the first item without indexing the result of
    # items(), which is not subscriptable on Python 3.
    _ceName, computingElementDict = next( iter( self.computingElementDict.items() ) )

    self.computingElement = computingElementDict['CE']

    self.log.debug( self.computingElement.getCEStatus() )

    self.log.info( ' SiteName:', self.siteName )


  def configureFromSection( self, mySection ):
    """
      reload from CS

      Reads the ComputingElements and SiteName options of mySection, keeping
      the current values as defaults, and makes sure a CE object exists for
      every listed CE.
    """
    PilotDirector.configureFromSection( self, mySection )

    self.computingElementList = gConfig.getValue( mySection+'/ComputingElements'      , self.computingElementList )
    self.addComputingElement( self.computingElementList )

    self.siteName             = gConfig.getValue( mySection+'/SiteName'               , self.siteName )


  def addComputingElement(self, ceList):
    """
      Check if a CE object for the current CE is available,
      instantiate one if necessary

      NOTE(review): the first CE that can not be created aborts the whole
      loop, so the CEs listed after it are not added either - confirm this
      is intended.
    """
    for CE in ceList:
      if CE not in self.computingElementDict:
        ceFactory = ComputingElementFactory( )
        ceInstance = ceFactory.getCE( ceName = CE )
        if not ceInstance['OK']:
          self.log.error('Can not create CE object:', ceInstance['Message'])
          return
        self.computingElementDict[CE] = ceInstance['Value'].ceConfigDict
        # add the 'CE' instance at the end to avoid being overwritten
        self.computingElementDict[CE]['CE'] = ceInstance['Value']


  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method does the actual pilot submission to the DIRAC CE
      The logic is as follows:
      - If list matching is enabled and there is no queue available in the
        CE's, it returns error
      - It creates a temp directory
      - For every known CE it prepares a PilotScript and submits it until
        pilotsToSubmit pilots have been submitted or the CE has no room left

      The caller's pilotOptions list is not modified: the site name and the
      CE specific options are added to a private copy made for each CE, so
      the options of one CE do not leak into the script of the next one.

      Returns S_OK( number of submitted pilots ) or S_ERROR.
    """

    taskQueueID = taskQueueDict['TaskQueueID']

    submittedPilots = 0

    pilotRequirements = [ ( 'CPUTime', taskQueueDict['CPUTime'] ) ]
    # do we need to care about anything else?
    pilotRequirementsString = str( pilotRequirements )

    # Check that there are available queues for the Jobs. The check only makes
    # sense (and availableQueues is only defined) when list matching is on.
    if self.enableListMatch:
      availableQueues = self.listMatchCache.get( pilotRequirementsString )
      if availableQueues is None:
        availableQueues = self._listQueues( pilotRequirements )
        if availableQueues != False:
          self.listMatchCache.add( pilotRequirementsString, self.listMatchDelay, availableQueues )
          self.log.verbose( 'Available Queues for TaskQueue ',  "%s: %s" % ( taskQueueID, str(availableQueues) ) )

      if not availableQueues:
        return S_ERROR( ERROR_CE + ' TQ: %d' % taskQueueID )

    baseDir = os.getcwd()
    workingDirectory = tempfile.mkdtemp( prefix= 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )
    os.chdir( workingDirectory )

    def cleanUp():
      """ Best effort: go back to the original directory and drop the temp one """
      try:
        os.chdir( baseDir )
        shutil.rmtree( workingDirectory )
      except Exception as error:
        self.log.verbose( 'Could not clean up working directory:', '%s (%s)' % ( workingDirectory, error ) )

    # options common to all CEs: the caller's options plus the Site Name
    baseOptions = list( pilotOptions )
    baseOptions.append( "-n '%s'" % self.siteName )

    # submit pilots for every CE available
    for ceName, ceConfigDict in self.computingElementDict.items():
      computingElement = ceConfigDict['CE']
      ceOptions = list( baseOptions )

      # add possible requirements from Site and CE
      for req, val in getResourceDict( ceName ).items():
        ceOptions.append( "-o '/AgentJobRequirements/%s=%s'" % ( req, val ) )

      if 'ClientPlatform' in ceConfigDict:
        ceOptions.append( "-p '%s'" % ceConfigDict['ClientPlatform'])

      if 'SharedArea' in ceConfigDict:
        ceOptions.append( "-o '/LocalSite/SharedArea=%s'" % ceConfigDict['SharedArea'] )

      self.log.info( "pilotOptions: ", ' '.join( ceOptions ) )

      httpProxy = ceConfigDict.get( 'HttpProxy', '' )
      # An empty execution directory makes the generated wrapper fall back to
      # the default temporary directory of the execution host.
      pilotExecDir = ceConfigDict.get( 'JobExecDir', '' )

      try:
        pilotScript = self._writePilotScript( workingDirectory, ceOptions, proxy, httpProxy, pilotExecDir )
      except Exception:
        self.log.exception( ERROR_SCRIPT )
        cleanUp()
        return S_ERROR( ERROR_SCRIPT )

      self.log.info("Pilots to submit: ", pilotsToSubmit)
      while submittedPilots < pilotsToSubmit:
        # Find out how many pilots can be submitted
        ret = computingElement.available( )
        if not ret['OK']:
          self.log.error('Can not determine if pilot should be submitted: ', ret['Message'])
          break
        maxPilotsToSubmit = ret['Value']
        self.log.info("Submit Pilots: ", maxPilotsToSubmit)
        if not maxPilotsToSubmit:
          break
        # submit the pilots and then check again
        for _i in range( min( maxPilotsToSubmit, pilotsToSubmit - submittedPilots ) ):
          submission = computingElement.submitJob(pilotScript, '', '')
          if not submission['OK']:
            self.log.error('Pilot submission failed: ', submission['Message'])
            cleanUp()
            return S_ERROR('Pilot submission failed after ' + str(submittedPilots) + ' pilots submitted successful')
          submittedPilots += 1
          # let the batch system some time to digest the submitted job
          time.sleep(1)

      #next CE

    cleanUp()

    return S_OK(submittedPilots)

  def _listQueues( self, pilotRequirements ):
    """
     Ask self.computingElement what is available for the given pilot
     requirements.
     Currently only CPU time is considered

     Returns the 'Value' of the CE answer, or False when the CE call fails.
    """
    result = self.computingElement.available( pilotRequirements )
    if not result['OK']:
      self.log.error( 'Can not determine available queues', result['Message'] )
      return False
    return result['Value']


  def _writePilotScript( self, workingDirectory, pilotOptions, proxy, httpProxy, pilotExecDir ):
    """
     Prepare the script to execute the pilot
     For the moment it will do like Grid Pilots, a full DIRAC installation

     It assumes that the pilot script will have access to the submit working directory

     The proxy, the pilot script and the install script are bz2 compressed,
     base64 encoded and embedded in a wrapper that is written to a new
     temporary file inside workingDirectory.

     Returns the path of the wrapper. Any failure while reading or encoding
     the payload is logged and re-raised, so that callers never receive
     anything but a file name.
    """
    try:
      # read the payload with context managers so the files are always closed
      with open( self.pilot, "rb" ) as pilotFile:
        pilotContent = pilotFile.read()
      with open( self.install, "rb" ) as installFile:
        installContent = installFile.read()
      compressedAndEncodedProxy = base64.encodestring( bz2.compress( proxy.dumpAllToString()['Value'] ) ).replace('\n','')
      compressedAndEncodedPilot = base64.encodestring( bz2.compress( pilotContent, 9 ) ).replace('\n','')
      compressedAndEncodedInstall = base64.encodestring( bz2.compress( installContent, 9 ) ).replace('\n','')
    except Exception:
      self.log.exception('Exception during file compression of proxy, dirac-pilot or dirac-install')
      raise

    localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, stat, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  os.chdir( pilotWorkingDirectory )
  open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedProxy)s" ) ) )
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedPilot)s" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedInstall)s" ) ) )
  os.chmod("proxy", stat.S_IRUSR | stat.S_IWUSR)
  os.chmod("%(pilotScript)s", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
  os.chmod("%(installScript)s", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % { 'compressedAndEncodedProxy': compressedAndEncodedProxy,
        'compressedAndEncodedPilot': compressedAndEncodedPilot,
        'compressedAndEncodedInstall': compressedAndEncodedInstall,
        'httpProxy': httpProxy,
        'pilotScript': os.path.basename(self.pilot),
        'installScript': os.path.basename(self.install),
        'pilotOptions': ' '.join( pilotOptions ),
        'pilotExecDir': pilotExecDir }

    fd, name = tempfile.mkstemp( suffix = '_pilotwrapper.py', prefix = 'DIRAC_', dir=workingDirectory)
    pilotWrapper = os.fdopen(fd, 'w')
    try:
      pilotWrapper.write( localPilot )
    finally:
      # never leave the descriptor returned by mkstemp open
      pilotWrapper.close()

    return name

  def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """
    Download a limited pilot proxy with VOMS extensions depending on the group

    When no VOMS attribute is configured for ownerGroup a proxy without VOMS
    extensions is downloaded instead. Returns the result of the
    gProxyManager call.
    """
    #Assign VOMS attribute
    vomsAttr = CS.getVOMSAttributeForGroup( ownerGroup )
    if not vomsAttr:
      self.log.info( "Downloading a proxy without VOMS extensions for %s@%s" % ( ownerDN, ownerGroup ) )
      return gProxyManager.downloadProxy( ownerDN, ownerGroup, limited = True,
                                          requiredTimeLeft = requiredTimeLeft )

    self.log.info( "Downloading a proxy with '%s' VOMS extension for %s@%s" % ( vomsAttr, ownerDN, ownerGroup ) )
    return gProxyManager.downloadVOMSProxy( ownerDN,
                                            ownerGroup,
                                            limited = True,
                                            requiredTimeLeft = requiredTimeLeft,
                                            requiredVOMSAttribute = vomsAttr )
コード例 #6
0
class CredentialsClient:
  """
  Client of the "WebAPI/Credentials" service that keeps local caches of the
  consumers, requests and tokens it has seen.

  Cache keys used by this class:
    consumers : consumerKey
    requests  : request
    tokens    : ( consumerKey, token )
  """

  CONSUMER_GRACE_TIME = 3600
  REQUEST_GRACE_TIME = 900

  def __init__( self, RPCFunctor = None ):
    """
    :param RPCFunctor: callable used to build the RPC client from the service
                       name; RPCClient is used when it is not given
    """
    if not RPCFunctor:
      self.__RPCFunctor = RPCClient
    else:
      self.__RPCFunctor = RPCFunctor
    self.__tokens = DictCache()
    self.__requests = DictCache()
    self.__consumers = DictCache( deleteFunction = self.__cleanConsumerCache )

  def __getRPC( self ):
    """ Build a new RPC client for the credentials service """
    return self.__RPCFunctor( "WebAPI/Credentials" )

  def __cleanReturn( self, result ):
    """ Remove the 'rpcStub' entry (if any) from result and return result """
    result.pop( 'rpcStub', None )
    return result

  ##
  # Consumer
  ##

  def generateConsumerPair( self, name, callback, icon, consumerKey = "" ):
    """
    Ask the service to generate a consumer and cache the returned value.

    NOTE(review): the value is cached under the consumerKey argument, which
    is the empty string by default - confirm whether the key generated by
    the service should be used instead.
    """
    result = self.__getRPC().generateConsumerPair( name, callback, icon, consumerKey )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    self.__consumers.add( consumerKey, self.CONSUMER_GRACE_TIME, result[ 'Value' ] )
    return self.__cleanReturn( result )

  def getConsumerData( self, consumerKey ):
    """ Return the consumer data, from the cache when possible """
    cData = self.__consumers.get( consumerKey )
    if cData:
      return S_OK( cData )
    result = self.__getRPC().getConsumerData( consumerKey )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    self.__consumers.add( consumerKey, self.CONSUMER_GRACE_TIME, result[ 'Value' ] )
    return self.__cleanReturn( result )

  def deleteConsumer( self, consumerKey ):
    """ Delete the consumer in the service and forget what is cached for it """
    self.__consumers.delete( consumerKey )
    result = self.__getRPC().deleteConsumer( consumerKey )
    if result[ 'OK' ]:
      self.__cleanConsumerCache( { 'key' : consumerKey } )
    return self.__cleanReturn( result )

  def getAllConsumers( self ):
    """
    Retrieve all consumers and cache each of them under its key.

    The service answer is expected to hold the column names in 'Parameters'
    and one sequence per consumer in 'Records'.
    """
    result = self.__getRPC().getAllConsumers()
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    data = result[ 'Value' ]
    # 'Parameters' is a sequence of column names: look the position up with
    # index(), sequences have no find() method
    params = list( data[ 'Parameters' ] )
    fields = ( 'key', 'name', 'callback', 'secret', 'icon' )
    consIndex = dict( ( field, params.index( field ) ) for field in fields )
    for record in data[ 'Records' ]:
      consData = dict( ( field, record[ consIndex[ field ] ] ) for field in fields )
      self.__consumers.add( consData[ 'key' ], self.CONSUMER_GRACE_TIME, consData )
    return self.__cleanReturn( result )

  def __cleanConsumerCache( self, cData ):
    """
    Drop the cached entries that belong to the consumer described by cData.

    Only tuple keys carry the consumer key in their first position; any other
    key is left alone instead of being compared character by character.
    """
    consumerKey = cData[ 'key' ]
    for dc in ( self.__tokens, self.__requests ):
      for cKey in dc.getKeys():
        if isinstance( cKey, tuple ) and cKey and cKey[0] == consumerKey:
          dc.delete( cKey )

  ##
  # Requests
  ##

  def generateRequest( self, consumerKey, callback = "" ):
    """
    Generate a request for the consumer and cache it under the request id.

    NOTE(review): lifeTime is read from the top level of the RPC result here,
    while the token methods read it from result[ 'Value' ] - confirm against
    the service.
    """
    result = self.__getRPC().generateRequest( consumerKey, callback )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    requestData = result[ 'Value' ]
    self.__requests.add( requestData[ 'request' ], result[ 'lifeTime' ] - 5, requestData )
    return self.__cleanReturn( result )

  def getRequestData( self, request ):
    """ Return the request data, from the cache when possible """
    data = self.__requests.get( request )
    if data:
      return S_OK( data )
    result = self.__getRPC().getRequestData( request )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    # Store it where it is looked up above: the requests cache
    self.__requests.add( request, result[ 'lifeTime' ] - 5, result[ 'Value' ] )
    return self.__cleanReturn( result )

  def deleteRequest( self, request ):
    """ Delete the request in the service and, on success, in the cache """
    result = self.__getRPC().deleteRequest( request )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    # requests are cached under the request id itself
    if request in self.__requests.getKeys():
      self.__requests.delete( request )
    return self.__cleanReturn( result )

  ##
  # Verifiers
  ##

  def generateVerifier( self, consumerKey, request, userDN, userGroup, lifeTime = 3600 ):
    """ Forward to the service; verifiers are not cached """
    result = self.__getRPC().generateVerifier( consumerKey, request, userDN, userGroup, lifeTime )
    return self.__cleanReturn( result )

  def getVerifierData( self, verifier ):
    """ Forward to the service; verifiers are not cached """
    result = self.__getRPC().getVerifierData( verifier )
    return self.__cleanReturn( result )

  def deleteVerifier( self, verifier ):
    """ Forward to the service; verifiers are not cached """
    result = self.__getRPC().deleteVerifier( verifier )
    return self.__cleanReturn( result )

  def findVerifier( self, consumerKey, request ):
    """ Forward to the service; verifiers are not cached """
    result = self.__getRPC().findVerifier( consumerKey, request )
    return self.__cleanReturn( result )

  def setVerifierProperties( self, consumerKey, request, verifier,
                                 userDN, userGroup, lifeTime ):
    """ Forward to the service; verifiers are not cached """
    result = self.__getRPC().setVerifierProperties( consumerKey, request, verifier,
                                                  userDN, userGroup, lifeTime )
    return self.__cleanReturn( result )


  ##
  # Tokens
  ##

  def __purgeToken( self, token ):
    """
    Remove from the token cache every ( consumerKey, token ) entry of token.
    Keys with any other shape are ignored.
    """
    for cKey in self.__tokens.getKeys():
      if isinstance( cKey, tuple ) and len( cKey ) > 1 and cKey[1] == token:
        self.__tokens.delete( cKey )

  def generateToken( self, consumerKey, request, verifier ):
    """ Generate a token and cache it; returns S_OK( tokenData ) on success """
    result = self.__getRPC().generateToken( consumerKey, request, verifier )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    tokenData = result[ 'Value' ]
    cKey = ( consumerKey, tokenData[ 'token' ] )
    self.__tokens.add( cKey, tokenData[ 'lifeTime' ] - 5, tokenData )
    return S_OK( tokenData )

  def getTokenData( self, consumerKey, token ):
    """ Return the token data, from the cache when possible """
    cKey = ( consumerKey, token )
    tokenData = self.__tokens.get( cKey )
    if tokenData:
      return S_OK( tokenData )
    result = self.__getRPC().getTokenData( consumerKey, token )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    tokenData = result[ 'Value' ]
    self.__tokens.add( cKey, tokenData[ 'lifeTime' ] - 5, tokenData )
    return self.__cleanReturn( result )

  def revokeUserToken( self, userDN, userGroup, token ):
    """
    Revoke a token of the given user and forget it locally.

    The cache key holds no user information, so the entries are selected by
    token only; dropping a cache entry just forces a new lookup.
    """
    result = self.__getRPC().revokeUserToken( userDN, userGroup, token )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    self.__purgeToken( token )
    return self.__cleanReturn( result )

  def revokeToken( self, token ):
    """ Revoke a token and forget it locally """
    result = self.__getRPC().revokeToken( token )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    self.__purgeToken( token )
    return self.__cleanReturn( result )

  def cleanExpired( self ):
    """ Forward to the service; the raw RPC result is returned """
    return self.__getRPC().cleanExpired()

  def getTokens( self, condDict = None ):
    """
    Retrieve the tokens matching condDict and cache each of them.

    :param condDict: selection conditions sent to the service; an empty
                     dictionary is sent when it is not given

    The service answer is expected to hold the column names in 'Parameters'
    and one sequence per token in 'Records'. Columns missing from the answer
    fall back to "unknown" ( consumer key, token ) and 0 ( life time ).
    """
    if condDict is None:
      condDict = {}
    result = self.__getRPC().getTokens( condDict )
    if not result[ 'OK' ]:
      return self.__cleanReturn( result )
    params = result[ 'Value' ][ 'Parameters']
    data = result[ 'Value' ][ 'Records' ]
    # columns that are copied into the cached token data, and under which name
    dataFields = { 'UserDN' : 'userDN',
                   'UserGroup' : 'userGroup',
                   'Secret' : 'secret',
                   'LifeTime' : 'lifeTime' }
    for record in data:
      consumerKey = "unknown"
      token = "unknown"
      lifeTime = 0
      tokenData = {}
      for param, value in zip( params, record ):
        if param == "ConsumerKey":
          consumerKey = value
        elif param == "Token":
          token = value
        elif param in dataFields:
          tokenData[ dataFields[ param ] ] = value
          if param == "LifeTime":
            lifeTime = value
      self.__tokens.add( ( consumerKey, token ), lifeTime, tokenData )
    return self.__cleanReturn( result )
コード例 #7
0
ファイル: GridPilotDirector.py プロジェクト: acasajus/DIRAC
class GridPilotDirector(PilotDirector):
    """
    Base Grid PilotDirector class
    Derived classes must declare:
      self.Middleware: It must correspond to the string before "PilotDirector".
        (For proper naming of the logger)
      self.ResourceBrokers: list of Brokers used by the Director.
        (For proper error reporting)

    NOTE(review): parseListMatchStdout and parseJobSubmitStdout call
    self.__sendErrorMail, which is not defined in the part of the class shown
    here - confirm it exists, otherwise their error paths raise.
    """
    def __init__(self, submitPool):
        """
        Define some defaults and call parent __init__
        """
        self.gridEnv = GRIDENV

        self.cpuPowerRef = CPU_POWER_REF
        self.requirements = REQUIREMENTS
        self.rank = RANK
        self.fuzzyRank = FUZZY_RANK

        self.__failingWMSCache = DictCache()
        self.__ticketsWMSCache = DictCache()
        self.__listMatchWMSCache = DictCache()

        PilotDirector.__init__(self, submitPool)

    def configure(self, csSection, submitPool):
        """
        Here goes common configuration for all Grid PilotDirectors

        Brokers that are keys of the failing-WMS cache are removed from
        self.resourceBrokers and the remaining ones are shuffled.
        """
        PilotDirector.configure(self, csSection, submitPool)
        self.reloadConfiguration(csSection, submitPool)

        self.__failingWMSCache.purgeExpired()
        self.__ticketsWMSCache.purgeExpired()
        for rb in self.__failingWMSCache.getKeys():
            # remove() can not fail once membership has been checked
            if rb in self.resourceBrokers:
                self.resourceBrokers.remove(rb)

        self.resourceBrokers = List.randomize(self.resourceBrokers)

        if self.gridEnv:
            self.log.info(' GridEnv:        ', self.gridEnv)
        if self.resourceBrokers:
            self.log.info(' ResourceBrokers:', ', '.join(self.resourceBrokers))

    def configureFromSection(self, mySection):
        """
        reload from CS

        Every option keeps its current value as default. When no GridEnv is
        found in mySection, the GridEnv option of the WorkloadManagement
        instance of the current setup is tried.
        """
        PilotDirector.configureFromSection(self, mySection)

        self.gridEnv = gConfig.getValue(mySection + '/GridEnv', self.gridEnv)
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue(
                    '/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue(
                        '/Systems/WorkloadManagement/%s/GridEnv' % instance,
                        '')

        self.resourceBrokers = gConfig.getValue(mySection + '/ResourceBrokers',
                                                self.resourceBrokers)

        self.cpuPowerRef = gConfig.getValue(mySection + '/CPUPowerRef',
                                            self.cpuPowerRef)
        self.requirements = gConfig.getValue(mySection + '/Requirements',
                                             self.requirements)
        self.rank = gConfig.getValue(mySection + '/Rank', self.rank)
        self.fuzzyRank = gConfig.getValue(mySection + '/FuzzyRank',
                                          self.fuzzyRank)

    def _submitPilots(self, workDir, taskQueueDict, pilotOptions,
                      pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ,
                      proxy, pilotsPerJob):
        """
        This method does the actual pilot submission to the Grid RB
        The logic is as follows:
        - If there are no available RB it return error
        - If there is no VOMS extension in the proxy, return error
        - It creates a temp directory
        - Prepare a JDL
          it has some part common to gLite and LCG (the payload description)
          it has some part specific to each middleware
        - If list matching is enabled and no CE matches, return error
        - Submit, register the pilot references and, when more pilots than
          pilotsPerJob were requested, get a new proxy token and call itself
          again for the remaining ones

        pilotOptions is updated in place with the new proxy token option
        before the recursive call.

        Returns S_OK( number of submitted pilots ); error results produced
        after a partial submission also carry that number in 'Value'.
        """
        taskQueueID = taskQueueDict['TaskQueueID']
        credDict = proxy.getCredentials()['Value']
        ownerDN = credDict['identity']
        ownerGroup = credDict['group']

        if not self.resourceBrokers:
            # Since we can exclude RBs from the list, it may become empty
            return S_ERROR(ERROR_RB)

        # A VOMS extension is needed for the later interactions with WMS
        ret = gProxyManager.getVOMSAttributes(proxy)
        if not ret['OK']:
            self.log.error(ERROR_VOMS, ret['Message'])
            return S_ERROR(ERROR_VOMS)
        if not ret['Value']:
            return S_ERROR(ERROR_VOMS)

        workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID,
                                            dir=workDir)
        self.log.verbose('Using working Directory:', workingDirectory)

        # Write JDL
        retDict = self._prepareJDL(taskQueueDict, workingDirectory,
                                   pilotOptions, pilotsPerJob, ceMask,
                                   submitPrivatePilot, privateTQ)
        jdl = retDict['JDL']
        pilotRequirements = retDict['Requirements']
        rb = retDict['RB']
        if not jdl:
            shutil.rmtree(workingDirectory, ignore_errors=True)
            return S_ERROR(ERROR_JDL)

        # Check that there are available queues for the Job:
        if self.enableListMatch:
            now = Time.dateTime()
            availableCEs = self.listMatchCache.get(pilotRequirements)
            # Treat both None and False as "not in the cache": the other
            # director in this file tests the same cache call against None.
            if availableCEs is None or availableCEs is False:
                availableCEs = self._listMatch(proxy, jdl, taskQueueID, rb)
                if availableCEs != False:
                    self.log.verbose('LastListMatch', now)
                    self.log.verbose('AvailableCEs ', availableCEs)
                    self.listMatchCache.add(
                        pilotRequirements,
                        self.listMatchDelay * 60,
                        value=availableCEs)  # it is given in minutes
            if not availableCEs:
                shutil.rmtree(workingDirectory, ignore_errors=True)
                return S_ERROR(ERROR_CE + ' TQ: %d' % taskQueueID)

        # Now we are ready for the actual submission, so

        self.log.verbose('Submitting Pilots for TaskQueue', taskQueueID)
        submitRet = self._submitPilot(proxy, pilotsPerJob, jdl, taskQueueID,
                                      rb)
        shutil.rmtree(workingDirectory, ignore_errors=True)
        if not submitRet:
            return S_ERROR('Pilot Submission Failed for TQ %d ' % taskQueueID)

        submittedPilots = 0

        # Parametric jobs return one parent reference whose children are the
        # actual pilots; otherwise every reference is a pilot.
        parametric = pilotsPerJob != 1 and len(submitRet) != pilotsPerJob
        for pilotReference, resourceBroker in submitRet:
            if parametric:
                pilotReferences = self._getChildrenReferences(
                    proxy, pilotReference, taskQueueID)
            else:
                pilotReferences = [pilotReference]
            submittedPilots += len(pilotReferences)
            pilotAgentsDB.addPilotTQReference(pilotReferences, taskQueueID,
                                              ownerDN, ownerGroup,
                                              resourceBroker,
                                              self.gridMiddleware,
                                              pilotRequirements)

        # add some sleep here
        time.sleep(0.1 * submittedPilots)

        if pilotsToSubmit > pilotsPerJob:
            # Additional submissions are necessary, need to get a new token and iterate.
            pilotsToSubmit -= pilotsPerJob
            result = gProxyManager.requestToken(
                ownerDN, ownerGroup, max(pilotsToSubmit,
                                         self.maxJobsInFillMode))
            if not result['OK']:
                self.log.error(ERROR_TOKEN, result['Message'])
                result = S_ERROR(ERROR_TOKEN)
                result['Value'] = submittedPilots
                return result
            (token, numberOfUses) = result['Value']
            # Drop every previous token option. The list is rebuilt and
            # assigned in place: removing items while iterating over the
            # same list skips the element that follows each removal.
            tokenOption = '-o /Security/ProxyToken='
            pilotOptions[:] = [option for option in pilotOptions
                               if not option.startswith(tokenOption)]
            pilotOptions.append('-o /Security/ProxyToken=%s' % token)
            pilotsPerJob = max(
                1, min(pilotsPerJob,
                       int(numberOfUses / self.maxJobsInFillMode)))
            result = self._submitPilots(workDir, taskQueueDict, pilotOptions,
                                        pilotsToSubmit, ceMask,
                                        submitPrivatePilot, privateTQ, proxy,
                                        pilotsPerJob)
            if not result['OK']:
                if 'Value' not in result:
                    result['Value'] = 0
                result['Value'] += submittedPilots
                return result
            submittedPilots += result['Value']

        return S_OK(submittedPilots)

    def _prepareJDL(self, taskQueueDict, workingDirectory, pilotOptions,
                    pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ):
        """
        This method should be overridden in a subclass

        The base implementation logs an error and terminates the process.
        """
        self.log.error(
            '_prepareJDL() method should be implemented in a subclass')
        sys.exit()

    def _JobJDL(self, taskQueueDict, pilotOptions, ceMask):
        """
        The Job JDL is the same for LCG and GLite

        Returns the tuple ( pilotJDL, pilotRequirements ), the second item
        being the requirements expression that is also embedded in the JDL.
        """
        pilotJDL = 'Executable     = "%s";\n' % os.path.basename(self.pilot)
        executable = self.pilot

        pilotJDL += 'Arguments     = "%s";\n' % ' '.join(pilotOptions)

        pilotJDL += 'CPUTimeRef    = %s;\n' % taskQueueDict['CPUTime']

        pilotJDL += 'CPUPowerRef   = %s;\n' % self.cpuPowerRef

        pilotJDL += """CPUWorkRef    = real( CPUTimeRef * CPUPowerRef );

Lookup        = "CPUScalingReferenceSI00=*";
cap = isList( other.GlueCECapability ) ? other.GlueCECapability : { "dummy" };
i0 = regexp( Lookup, cap[0] ) ? 0 : undefined;
i1 = isString( cap[1] ) && regexp( Lookup, cap[1] ) ? 1 : i0;
i2 = isString( cap[2] ) && regexp( Lookup, cap[2] ) ? 2 : i1;
i3 = isString( cap[3] ) && regexp( Lookup, cap[3] ) ? 3 : i2;
i4 = isString( cap[4] ) && regexp( Lookup, cap[4] ) ? 4 : i3;
i5 = isString( cap[5] ) && regexp( Lookup, cap[5] ) ? 5 : i4;
index = isString( cap[6] ) && regexp( Lookup, cap[6] ) ? 6 : i5;
i = isUndefined( index ) ? 0 : index;

QueuePowerRef = real( ! isUndefined( index ) ? int( substr( cap[i], size( Lookup ) - 1 ) ) : other.GlueHostBenchmarkSI00 );
QueueTimeRef  = real( other.GlueCEPolicyMaxCPUTime * 60 );
QueueWorkRef  = QueuePowerRef * QueueTimeRef;
"""

        requirements = list(self.requirements)
        if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
            # if there is an explicit Grid CE requested by the TQ, remove the Ranking requirement
            requirements = [req for req in requirements
                            if not req.strip().lower().startswith('rank >')]

        requirements.append('QueueWorkRef > CPUWorkRef')

        siteRequirements = '\n || '.join(
            ['other.GlueCEInfoHostName == "%s"' % s for s in ceMask])
        requirements.append("( %s\n )" % siteRequirements)

        pilotRequirements = '\n && '.join(requirements)

        pilotJDL += 'pilotRequirements  = %s;\n' % pilotRequirements

        pilotJDL += 'Rank          = %s;\n' % self.rank
        pilotJDL += 'FuzzyRank     = %s;\n' % self.fuzzyRank
        pilotJDL += 'StdOutput     = "%s";\n' % outputSandboxFiles[0]
        pilotJDL += 'StdError      = "%s";\n' % outputSandboxFiles[1]

        pilotJDL += 'InputSandbox  = { "%s" };\n' % '", "'.join(
            [self.install, executable])

        pilotJDL += 'OutputSandbox = { %s };\n' % ', '.join(
            ['"%s"' % f for f in outputSandboxFiles])

        self.log.verbose(pilotJDL)

        return (pilotJDL, pilotRequirements)

    def parseListMatchStdout(self, proxy, cmd, taskQueueID, rb):
        """
        Parse List Match stdout to return list of matched CE's

        Returns False when the command can not be executed or exits with a
        non zero status, otherwise the (possibly empty) list of the output
        lines that contain '/jobmanager-' or '/cream-'.
        """
        self.log.verbose('Executing List Match for TaskQueue', taskQueueID)

        start = time.time()
        ret = executeGridCommand(proxy, cmd, self.gridEnv)

        if not ret['OK']:
            self.log.error('Failed to execute List Match:', ret['Message'])
            self.__sendErrorMail(rb, 'List Match', cmd, ret, proxy)
            return False
        if ret['Value'][0] != 0:
            self.log.error('Error executing List Match:',
                           str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
            self.__sendErrorMail(rb, 'List Match', cmd, ret, proxy)
            return False
        self.log.info('List Match Execution Time: %.2f for TaskQueue %d' %
                      ((time.time() - start), taskQueueID))

        stdout = ret['Value'][1]
        stderr = ret['Value'][2]
        # Parse std.out
        # TODO: the line has to be stripped from extra info
        availableCEs = [line for line in List.fromChar(stdout, '\n')
                        if re.search('/jobmanager-', line)
                        or re.search('/cream-', line)]

        if not availableCEs:
            self.log.info('List-Match failed to find CEs for TaskQueue',
                          taskQueueID)
            self.log.info(stdout)
            self.log.info(stderr)
        else:
            self.log.debug('List-Match returns:',
                           str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
            self.log.info(
                'List-Match found %s CEs for TaskQueue' % len(availableCEs),
                taskQueueID)
            self.log.verbose(', '.join(availableCEs))

        return availableCEs

    def parseJobSubmitStdout(self, proxy, cmd, taskQueueID, rb):
        """
        Parse Job Submit stdout to return pilot reference

        Returns False when the command fails or prints no https reference,
        otherwise the tuple ( reference, broker ): the last reference found
        in the output and the host part of the first one (an empty string
        when no host can be extracted).
        """
        start = time.time()
        self.log.verbose('Executing Job Submit for TaskQueue', taskQueueID)

        ret = executeGridCommand(proxy, cmd, self.gridEnv)

        if not ret['OK']:
            self.log.error('Failed to execute Job Submit:', ret['Message'])
            self.__sendErrorMail(rb, 'Job Submit', cmd, ret, proxy)
            return False
        if ret['Value'][0] != 0:
            self.log.error('Error executing Job Submit:',
                           str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
            self.__sendErrorMail(rb, 'Job Submit', cmd, ret, proxy)
            return False
        self.log.info('Job Submit Execution Time: %.2f for TaskQueue %d' %
                      ((time.time() - start), taskQueueID))

        stdout = ret['Value'][1]

        pilotReference = None
        resourceBroker = ''
        for line in List.fromChar(stdout, '\n'):
            match = re.search(r"(https:\S+)", line)
            if not match:
                continue
            pilotReference = match.group(1)
            if not resourceBroker:
                # a reference without a port yields no broker instead of
                # failing on a missing match
                hostMatch = re.search(r"https://(.+):.+", pilotReference)
                if hostMatch:
                    resourceBroker = hostMatch.group(1)

        if pilotReference is None:
            self.log.error('Job Submit returns no Reference:',
                           str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
            return False

        self.log.info('Reference %s for TaskQueue %s' %
                      (pilotReference, taskQueueID))

        return pilotReference, resourceBroker

    def _writeJDL(self, filename, jdlList):
        """
        Write the items of jdlList, one per line, into filename.

        Returns filename, or an empty string when the file can not be
        written (the exception is logged).
        """
        try:
            # the context manager closes the file even if write() fails
            with open(filename, 'w') as jdlFile:
                jdlFile.write('\n'.join(jdlList))
        except Exception:
            self.log.exception()
            return ''

        return filename
コード例 #8
0
class Cache( object ):
  """
    Generic cache whose content is produced by a user supplied update function.

    WARNING: none of the methods takes the lock on its own. Callers have to wrap
    their calls between acquireLock and releaseLock !
  """

  def __init__( self, lifeTime, updateFunc ):
    """
    Constructor

    :Parameters:
      **lifeTime** - `int`
        Nominal lifetime of the cached elements, in seconds. The effective value
        is stretched by a random factor of up to 20%.
      **updateFunc** - `function`
        Called without arguments. It MUST return a S_OK | S_ERROR object and, in
        the S_OK case, its value must be a dictionary.

    """

    # Up to 20% is added to the lifetime at random, so that thousands of jobs
    # starting together do not see their caches expire at the same moment.
    randomLifeTimeBias = 0.2 * random.random()

    self.log = gLogger.getSubLogger( self.__class__.__name__ )

    stretchFactor     = 1 + randomLifeTimeBias
    self.__lifeTime   = int( lifeTime * stretchFactor )
    self.__updateFunc = updateFunc
    # Anything handed out by the cache must stay valid for 10 more seconds.
    self.__validSeconds = 10

    # Storage and the lock protecting it; the lock is named after the class
    self.__cache     = DictCache()
    self.__cacheLock = LockRing()
    self.__cacheLock.getLock( self.__class__.__name__ )

  #.............................................................................
  # internal cache object getter

  def cacheKeys( self ):
    """
    Cache keys getter

    :returns: list with the keys of the cache that are still valid
    """

    validKeys = self.__cache.getKeys( validSeconds = self.__validSeconds )
    return validKeys

  #.............................................................................
  # acquire / release Locks

  def acquireLock( self ):
    """
    Takes the Cache lock
    """

    lockName = self.__class__.__name__
    self.__cacheLock.acquire( lockName )

  def releaseLock( self ):
    """
    Gives the Cache lock back
    """

    lockName = self.__class__.__name__
    self.__cacheLock.release( lockName )

  #.............................................................................
  # Cache getters

  def get( self, cacheKeys ):
    """
    Looks up every key of cacheKeys. If all of them are present in the cache and
    valid, returns S_OK with a dictionary { key : value }. As soon as one key
    yields nothing, the key is logged and S_ERROR is returned.

    :Parameters:
      **cacheKeys** - `list`
        list of keys to be extracted from the cache

    :return: S_OK | S_ERROR
    """

    found = {}

    for key in cacheKeys:

      row = self.__cache.get( key, validSeconds = self.__validSeconds )
      if row:
        found[ key ] = row
        continue

      # Nothing usable for this key: give up on the whole request
      self.log.error( str( key ) )
      return S_ERROR( 'Cannot get %s' % str( key ) )

    return S_OK( found )

  #.............................................................................
  # Cache refreshers

  def refreshCache( self ):
    """
    Empties the cache and fills it again with the output of the update function.

    :return: S_OK | S_ERROR. S_OK carries the new cache content; S_ERROR is the
      object returned by the update function, and the cache stays empty.
    """

    self.log.verbose( 'refreshing...' )

    self.__cache.purgeAll()

    updateResult = self.__updateFunc()
    if updateResult[ 'OK' ]:
      refreshed = self.__updateCache( updateResult[ 'Value' ] )
      self.log.verbose( 'refreshed' )
      return refreshed

    self.log.error( updateResult[ 'Message' ] )
    return updateResult

  #.............................................................................
  # Private methods

  def __updateCache( self, newCache ):
    """
    Inserts every entry of the new cache dictionary into the internal cache,
    each one with a duration of <self.__lifeTime> seconds.

    :Parameters:
      **newCache** - `dict`
        dictionary containing a new cache

    :return: S_OK wrapping the newCache argument.
    """

    lifeTime = self.__lifeTime
    for key, value in newCache.items():
      self.__cache.add( key, lifeTime, value = value )

    # Insertion is assumed not to fail, hence the unconditional S_OK.
    return S_OK( newCache )
コード例 #9
0
ファイル: GridPilotDirector.py プロジェクト: sbel/bes3-jinr
class GridPilotDirector( PilotDirector ):
  """
    Base Grid PilotDirector class
    Derived classes must declare:
      self.Middleware: It must correspond to the string before "PilotDirector".
        (For proper naming of the logger)
      self.ResourceBrokers: list of Brokers used by the Director.
        (For proper error reporting)
  """
  def __init__( self, submitPool ):
    """
     Set the Grid specific defaults from the module level constants, create the
     WMS caches and finally run the PilotDirector constructor.
    """
    # Defaults, possibly overwritten later on by the configuration
    self.gridEnv = GRIDENV
    self.cpuPowerRef, self.requirements = CPU_POWER_REF, REQUIREMENTS
    self.rank, self.fuzzyRank = RANK, FUZZY_RANK

    # One cache per kind of WMS bookkeeping
    self.__failingWMSCache = DictCache()
    self.__ticketsWMSCache = DictCache()
    self.__listMatchWMSCache = DictCache()

    # The parent constructor runs last, once the attributes above exist
    PilotDirector.__init__( self, submitPool )

  def configure( self, csSection, submitPool ):
    """
     Configuration step shared by all Grid PilotDirectors: run the parent
     configuration, reload the configuration, purge the expired entries of the
     failing / tickets WMS caches, take the brokers still in the failing cache
     out of self.resourceBrokers and pass the rest through List.randomize.
    """
    PilotDirector.configure( self, csSection, submitPool )
    self.reloadConfiguration( csSection, submitPool )

    self.__failingWMSCache.purgeExpired()
    self.__ticketsWMSCache.purgeExpired()
    for failingRB in self.__failingWMSCache.getKeys():
      if failingRB not in self.resourceBrokers:
        continue
      # Best effort removal: whatever goes wrong here is ignored on purpose
      try:
        self.resourceBrokers.remove( failingRB )
      except:
        pass

    self.resourceBrokers = List.randomize( self.resourceBrokers )

    # Report the resulting settings, skipping the empty ones
    if self.gridEnv:
      self.log.info( ' GridEnv:        ', self.gridEnv )
    if self.resourceBrokers:
      brokerNames = ', '.join( self.resourceBrokers )
      self.log.info( ' ResourceBrokers:', brokerNames )

  def configureFromSection( self, mySection ):
    """
      Reload the Grid options from the given CS section. For each option the
      current value is handed to gConfig.getValue as the default.
    """
    PilotDirector.configureFromSection( self, mySection )

    self.gridEnv = gConfig.getValue( mySection + '/GridEnv', self.gridEnv )
    if not self.gridEnv:
      # No specific option found: try the GridEnv of the WorkloadManagement
      # instance of the current setup, provided both can be resolved
      setup = gConfig.getValue( '/DIRAC/Setup', '' )
      instance = ''
      if setup:
        instance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % setup, '' )
      if instance:
        self.gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % instance, '' )

    # ( attribute, option ) pairs, read in this order
    sectionOptions = ( ( 'resourceBrokers', '/ResourceBrokers' ),
                       ( 'cpuPowerRef', '/CPUPowerRef' ),
                       ( 'requirements', '/Requirements' ),
                       ( 'rank', '/Rank' ),
                       ( 'fuzzyRank', '/FuzzyRank' ) )
    for attribute, option in sectionOptions:
      currentValue = getattr( self, attribute )
      setattr( self, attribute, gConfig.getValue( mySection + option, currentValue ) )

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method does the actual pilot submission to the Grid RB
      The logic is as follows:
      - If there are no available RB it returns error
      - If there is no VOMS extension in the proxy, it returns error
      - It creates a temp directory under workDir, removed again once the
        submission has been attempted or abandoned
      - It prepares a JDL through _prepareJDL
        it has some part common to gLite and LCG (the payload description)
        it has some part specific to each middleware
      - If list match is enabled, it checks (through listMatchCache) that there
        are CEs matching the pilot requirements
      - It submits through _submitPilot and registers the references in
        pilotAgentsDB
      - If more pilots are requested than this submission covers, it requests a
        new proxy token, replaces the token option and calls itself again

      :param workDir: directory in which the temporary directory is created
      :param taskQueueDict: TaskQueue description; 'TaskQueueID' is read here
      :param pilotOptions: list of pilot options. It is modified in place when a
                           new proxy token is requested.
      :param pilotsToSubmit: number of pilots still to be submitted
      :param ceMask: passed through to _prepareJDL
      :param submitPrivatePilot: passed through to _prepareJDL
      :param privateTQ: passed through to _prepareJDL
      :param proxy: proxy used for the credentials and all Grid commands
      :param pilotsPerJob: number of pilots per submitted job
      :return: S_OK( number of submitted pilots ) or S_ERROR. When a follow-up
               submission fails, the S_ERROR carries under 'Value' the number of
               pilots submitted up to that point.
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    # The owner is taken from the proxy credentials, not from the TaskQueue
    credDict = proxy.getCredentials()['Value']
    ownerDN = credDict['identity']
    ownerGroup = credDict[ 'group' ]

    if not self.resourceBrokers:
      # Since we can exclude RBs from the list, it may become empty
      return S_ERROR( ERROR_RB )

    # A VOMS extension is needed for the later interactions with WMS
    ret = gProxyManager.getVOMSAttributes( proxy )
    if not ret['OK']:
      self.log.error( ERROR_VOMS, ret['Message'] )
      return S_ERROR( ERROR_VOMS )
    if not ret['Value']:
      return S_ERROR( ERROR_VOMS )

    workingDirectory = tempfile.mkdtemp( prefix = 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    # Write JDL
    retDict = self._prepareJDL( taskQueueDict, workingDirectory, pilotOptions, pilotsPerJob,
                                ceMask, submitPrivatePilot, privateTQ )
    jdl = retDict['JDL']
    pilotRequirements = retDict['Requirements']
    rb = retDict['RB']
    if not jdl:
      # Best effort cleanup, failures to remove the directory are ignored
      shutil.rmtree( workingDirectory, ignore_errors = True )
      return S_ERROR( ERROR_JDL )

    # Check that there are available queues for the Job:
    if self.enableListMatch:
      now = Time.dateTime()
      availableCEs = self.listMatchCache.get( pilotRequirements )
      # NOTE(review): False is presumably what the cache answers on a miss, and
      # also what _listMatch answers on failure - confirm against both.
      if availableCEs == False:
        availableCEs = self._listMatch( proxy, jdl, taskQueueID, rb )
        if availableCEs != False:
          self.log.verbose( 'LastListMatch', now )
          self.log.verbose( 'AvailableCEs ', availableCEs )
          # listMatchDelay is given in minutes
          self.listMatchCache.add( pilotRequirements, self.listMatchDelay * 60,
                                   value = availableCEs )
      if not availableCEs:
        shutil.rmtree( workingDirectory, ignore_errors = True )
        return S_ERROR( ERROR_CE + ' TQ: %d' % taskQueueID )

    # Now we are ready for the actual submission, so

    self.log.verbose( 'Submitting Pilots for TaskQueue', taskQueueID )
    submitRet = self._submitPilot( proxy, pilotsPerJob, jdl, taskQueueID, rb )
    shutil.rmtree( workingDirectory, ignore_errors = True )
    if not submitRet:
      return S_ERROR( 'Pilot Submission Failed for TQ %d ' % taskQueueID )

    submittedPilots = 0

    # Parametric jobs are used when several pilots per job were requested and
    # the number of returned references differs from that number. In that case
    # each reference is expanded into its children references.
    parametric = pilotsPerJob != 1 and len( submitRet ) != pilotsPerJob
    for pilotReference, resourceBroker in submitRet:
      if parametric:
        pilotReferences = self._getChildrenReferences( proxy, pilotReference, taskQueueID )
      else:
        pilotReferences = [ pilotReference ]
      submittedPilots += len( pilotReferences )
      pilotAgentsDB.addPilotTQReference( pilotReferences, taskQueueID, ownerDN,
                                         ownerGroup, resourceBroker, self.gridMiddleware,
                                         pilotRequirements )

    # add some sleep here
    time.sleep( 0.1 * submittedPilots )

    if pilotsToSubmit > pilotsPerJob:
      # Additional submissions are necessary, need to get a new token and iterate.
      pilotsToSubmit -= pilotsPerJob
      result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) )
      if not result[ 'OK' ]:
        self.log.error( ERROR_TOKEN, result['Message'] )
        result = S_ERROR( ERROR_TOKEN )
        result['Value'] = submittedPilots
        return result
      ( token, numberOfUses ) = result[ 'Value' ]
      # Drop every previous token option. The list is rebuilt (in place, the
      # caller keeps seeing the same object) rather than calling remove() while
      # iterating over it, which would skip the element after each removal.
      pilotOptions[:] = [ option for option in pilotOptions
                          if not option.startswith( '-o /Security/ProxyToken=' ) ]
      pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
      pilotsPerJob = max( 1, min( pilotsPerJob, int( numberOfUses / self.maxJobsInFillMode ) ) )
      result = self._submitPilots( workDir, taskQueueDict, pilotOptions,
                                   pilotsToSubmit, ceMask,
                                   submitPrivatePilot, privateTQ,
                                   proxy, pilotsPerJob )
      if not result['OK']:
        # Report the pilots submitted so far together with the error
        if 'Value' not in result:
          result['Value'] = 0
        result['Value'] += submittedPilots
        return result
      submittedPilots += result['Value']

    return S_OK( submittedPilots )

  def _prepareJDL( self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ ):
    """
      Placeholder to be overridden in a subclass. This base version only logs
      an error and terminates through sys.exit().
    """
    message = '_prepareJDL() method should be implemented in a subclass'
    self.log.error( message )
    sys.exit()

  def _JobJDL( self, taskQueueDict, pilotOptions, ceMask ):
    """
     The Job JDL is the same for LCG and GLite
    """
    pilotJDL = 'Executable     = "%s";\n' % os.path.basename( self.pilot )
    executable = self.pilot

    pilotJDL += 'Arguments     = "%s";\n' % ' '.join( pilotOptions )

    pilotJDL += 'CPUTimeRef    = %s;\n' % taskQueueDict['CPUTime']

    pilotJDL += 'CPUPowerRef   = %s;\n' % self.cpuPowerRef

    pilotJDL += """CPUWorkRef    = real( CPUTimeRef * CPUPowerRef );

Lookup        = "CPUScalingReferenceSI00=*";
cap = isList( other.GlueCECapability ) ? other.GlueCECapability : { "dummy" };
i0 = regexp( Lookup, cap[0] ) ? 0 : undefined;
i1 = isString( cap[1] ) && regexp( Lookup, cap[1] ) ? 1 : i0;
i2 = isString( cap[2] ) && regexp( Lookup, cap[2] ) ? 2 : i1;
i3 = isString( cap[3] ) && regexp( Lookup, cap[3] ) ? 3 : i2;
i4 = isString( cap[4] ) && regexp( Lookup, cap[4] ) ? 4 : i3;
i5 = isString( cap[5] ) && regexp( Lookup, cap[5] ) ? 5 : i4;
index = isString( cap[6] ) && regexp( Lookup, cap[6] ) ? 6 : i5;
i = isUndefined( index ) ? 0 : index;

QueuePowerRef = real( ! isUndefined( index ) ? int( substr( cap[i], size( Lookup ) - 1 ) ) : other.GlueHostBenchmarkSI00 );
QueueTimeRef  = real( other.GlueCEPolicyMaxCPUTime * 60 );
QueueWorkRef  = QueuePowerRef * QueueTimeRef;
"""

    requirements = list( self.requirements )
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      # if there an explicit Grig CE requested by the TQ, remove the Ranking requirement
      for req in self.requirements:
        if req.strip().lower()[:6] == 'rank >':
          requirements.remove( req )

    requirements.append( 'QueueWorkRef > CPUWorkRef' )

    siteRequirements = '\n || '.join( [ 'other.GlueCEInfoHostName == "%s"' % s for s in ceMask ] )
    requirements.append( "( %s\n )" % siteRequirements )

    pilotRequirements = '\n && '.join( requirements )

    pilotJDL += 'pilotRequirements  = %s;\n' % pilotRequirements

    pilotJDL += 'Rank          = %s;\n' % self.rank
    pilotJDL += 'FuzzyRank     = %s;\n' % self.fuzzyRank
    pilotJDL += 'StdOutput     = "%s";\n' % outputSandboxFiles[0]
    pilotJDL += 'StdError      = "%s";\n' % outputSandboxFiles[1]

    pilotJDL += 'InputSandbox  = { "%s" };\n' % '", "'.join( [ self.install, executable ] )

    pilotJDL += 'OutputSandbox = { %s };\n' % ', '.join( [ '"%s"' % f for f in outputSandboxFiles ] )

    self.log.verbose( pilotJDL )

    return ( pilotJDL, pilotRequirements )


  def parseListMatchStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Run the List Match command and collect the CE lines of its stdout.

      :param proxy: proxy passed through to executeGridCommand
      :param cmd: command passed through to executeGridCommand
      :param taskQueueID: TaskQueue identifier, only used in log messages
      :param rb: broker name included in the error mail if the command fails
      :return: False if the command could not be executed or ended with a non
               zero status, otherwise the (possibly empty) list of stdout lines
               containing '/jobmanager-' or '/cream-'
    """
    self.log.verbose( 'Executing List Match for TaskQueue', taskQueueID )

    start = time.time()
    ret = executeGridCommand( proxy, cmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute List Match:', ret['Message'] )
      self.__sendErrorMail( rb, 'List Match', cmd, ret, proxy )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing List Match:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'List Match', cmd, ret, proxy )
      return False
    elapsed = time.time() - start
    self.log.info( 'List Match Execution Time: %.2f for TaskQueue %d' % ( elapsed, taskQueueID ) )

    stdout, stderr = ret['Value'][1], ret['Value'][2]

    # Keep the stdout lines that name a CE
    # TODO: the line has to be stripped from extra info
    availableCEs = [ line for line in List.fromChar( stdout, '\n' )
                     if re.search( '/jobmanager-', line ) or re.search( '/cream-', line ) ]

    if availableCEs:
      self.log.debug( 'List-Match returns:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.log.info( 'List-Match found %s CEs for TaskQueue' % len( availableCEs ), taskQueueID )
      self.log.verbose( ', '.join( availableCEs ) )
    else:
      # Nothing matched: dump the raw output to help understanding why
      self.log.info( 'List-Match failed to find CEs for TaskQueue', taskQueueID )
      self.log.info( stdout )
      self.log.info( stderr )

    return availableCEs

  def parseJobSubmitStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Execute the Job Submit command and parse its stdout for the pilot
      reference.

      :param proxy: proxy passed through to executeGridCommand
      :param cmd: command passed through to executeGridCommand
      :param taskQueueID: TaskQueue identifier, only used in log messages
      :param rb: broker name included in the error mail if the command fails
      :return: False if the command fails or prints no reference, otherwise the
               tuple ( reference, broker ). The reference is the last "https:"
               token found in stdout; the broker is taken from the first
               reference matching "https://<broker>:<rest>" and is '' when no
               reference has that form.
    """
    start = time.time()
    self.log.verbose( 'Executing Job Submit for TaskQueue', taskQueueID )

    ret = executeGridCommand( proxy, cmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute Job Submit:', ret['Message'] )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing Job Submit:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    self.log.info( 'Job Submit Execution Time: %.2f for TaskQueue %d' % ( ( time.time() - start ), taskQueueID ) )

    stdout = ret['Value'][1]

    glite_id = None
    # The rb argument was only needed for the error mails above; from here on
    # the name is reused for the broker read out of the reference itself.
    rb = ''
    for line in List.fromChar( stdout, '\n' ):
      refMatch = re.search( r"(https:\S+)", line )
      if not refMatch:
        continue
      # Later references overwrite earlier ones: the last one is returned
      glite_id = refMatch.group( 1 )
      if not rb:
        brokerMatch = re.search( "https://(.+):.+", glite_id )
        # A reference with no ':' after the host does not match. Leave rb empty
        # in that case instead of calling .group() on None.
        if brokerMatch:
          rb = brokerMatch.group( 1 )
    if glite_id is None:
      self.log.error( 'Job Submit returns no Reference:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      return False

    self.log.info( 'Reference %s for TaskQueue %s' % ( glite_id, taskQueueID ) )

    return glite_id, rb

  def _writeJDL( self, filename, jdlList ):
    """
      Write the JDL lines, joined with newlines, to the given file.

      :param filename: path of the file to (over)write
      :param jdlList: list of strings, one per JDL line
      :return: filename on success, '' if anything went wrong (the exception is
               reported through self.log.exception)
    """
    try:
      # The context manager closes the file even if write() fails
      with open( filename, 'w' ) as jdlFile:
        jdlFile.write( '\n'.join( jdlList ) )
    except Exception:
      self.log.exception()
      return ''

    return filename
コード例 #10
0
class Cache:
    """
    Generic cache whose content is produced by a user supplied update function.

    WARNING: none of the methods takes the lock on its own. Callers have to wrap
    their calls between acquireLock and releaseLock !
    """
    def __init__(self, lifeTime, updateFunc):
        """
        Constructor

        :Parameters:
          **lifeTime** - `int`
            Nominal lifetime of the cached elements, in seconds. The effective
            value is stretched by a random factor of up to 20%.
          **updateFunc** - `function`
            Called without arguments. It MUST return a S_OK | S_ERROR object and,
            in the S_OK case, its value must be a dictionary.

        """

        # Up to 20% is added to the lifetime at random, so that thousands of jobs
        # starting together do not see their caches expire at the same moment.
        randomLifeTimeBias = 0.2 * random.random()

        self.log = gLogger.getSubLogger(self.__class__.__name__)

        stretchFactor = 1 + randomLifeTimeBias
        self.__lifeTime = int(lifeTime * stretchFactor)
        self.__updateFunc = updateFunc
        # Anything handed out by the cache must stay valid for 30 more seconds.
        self.__validSeconds = 30

        # Storage and the lock protecting it; the lock is named after the class
        self.__cache = DictCache()
        self.__cacheLock = LockRing()
        self.__cacheLock.getLock(self.__class__.__name__)

    # internal cache object getter

    def cacheKeys(self):
        """
        Cache keys getter

        :returns: list with the keys in the cache that remain valid for at least
                  twice the validity period required from an element
        """

        # Asking for the plain validity period is not enough because of how the
        # matching works:
        # * get all the keys with validity T
        # * for each key K, get the element K with validity T
        # An element right at the limit would be listed and then not be found.
        requiredValidity = self.__validSeconds * 2
        return self.__cache.getKeys(validSeconds=requiredValidity)

    # acquire / release Locks

    def acquireLock(self):
        """
        Takes the Cache lock
        """
        lockName = self.__class__.__name__
        self.__cacheLock.acquire(lockName)

    def releaseLock(self):
        """
        Gives the Cache lock back
        """
        lockName = self.__class__.__name__
        self.__cacheLock.release(lockName)

    # Cache getters

    def get(self, cacheKeys):
        """
        Looks up every key of cacheKeys. If all of them are present in the cache
        and valid, returns S_OK with a dictionary { key : value }. As soon as one
        key yields nothing, S_ERROR is returned.

        :Parameters:
          **cacheKeys** - `list`
            list of keys to be extracted from the cache

        :return: S_OK | S_ERROR
        """

        found = {}

        for key in cacheKeys:
            row = self.__cache.get(key, validSeconds=self.__validSeconds)
            if row:
                found[key] = row
                continue

            # Nothing usable for this key: give up on the whole request
            return S_ERROR("Cannot get %s" % str(key))

        return S_OK(found)

    def check(self, cacheKeys, vO):
        """
        Variant of get() for 'flattened' cache keys (no vO). Each key is looked
        up with "all" appended first and, if that yields nothing, with the vO
        value appended. If every key is found one way or the other, returns S_OK
        with a dictionary { extended key : value }. As soon as one key yields
        nothing for both, S_ERROR is returned.

        :Parameters:
          **cacheKeys** - `list`
            list of keys (tuples without the vO element) to be extracted
          **vO**
            value appended to a key when the "all" variant is not found

        :return: S_OK | S_ERROR
        """

        found = {}

        for key in cacheKeys:
            # "all" takes precedence over the specific vO
            for suffix in ("all", vO):
                longKey = key + (suffix, )
                row = self.__cache.get(longKey,
                                       validSeconds=self.__validSeconds)
                if row:
                    break
            else:
                return S_ERROR(
                    'Cannot get extended %s (neither for VO = %s nor for "all" Vos)'
                    % (str(key), vO))
            found[longKey] = row

        return S_OK(found)

    # Cache refreshers

    def refreshCache(self):
        """
        Empties the cache and fills it again with the output of the update
        function.

        :return: S_OK | S_ERROR. S_OK carries the new cache content; S_ERROR is
                 the object returned by the update function, and the cache stays
                 empty.
        """

        self.log.verbose("refreshing...")

        self.__cache.purgeAll()

        updateResult = self.__updateFunc()
        if updateResult["OK"]:
            refreshed = self.__updateCache(updateResult["Value"])
            self.log.verbose("refreshed")
            return refreshed

        self.log.error(updateResult["Message"])
        return updateResult

    # Private methods

    def __updateCache(self, newCache):
        """
        Inserts every entry of the new cache dictionary into the internal cache,
        each one with a duration of <self.__lifeTime> seconds.

        :Parameters:
          **newCache** - `dict`
            dictionary containing a new cache

        :return: S_OK wrapping the newCache argument.
        """

        lifeTime = self.__lifeTime
        for key, value in newCache.items():
            self.__cache.add(key, lifeTime, value=value)

        # Insertion is assumed not to fail, hence the unconditional S_OK.
        return S_OK(newCache)