Example #1
0
  def __init__( self, submitPool ):
    """
     Set the Grid specific defaults and then run the PilotDirector initialization.

     All the attributes defined here are created before the parent constructor
     is called, which is the last step.
    """
    # Private per-instance caches
    self.__failingWMSCache = DictCache()
    self.__ticketsWMSCache = DictCache()
    self.__listMatchWMSCache = DictCache()

    # Defaults coming from the module level constants
    self.fuzzyRank = FUZZY_RANK
    self.rank = RANK
    self.requirements = REQUIREMENTS
    self.cpuPowerRef = CPU_POWER_REF
    self.gridEnv = GRIDENV

    PilotDirector.__init__( self, submitPool )
Example #2
0
  def __init__( self, submitPool ):
    """
     Set the Grid specific defaults and then run the PilotDirector initialization.

     All the attributes defined here are created before the parent constructor
     is called, which is the last step.
    """
    # Private per-instance caches
    self.__failingWMSCache = DictCache()
    self.__ticketsWMSCache = DictCache()
    self.__listMatchWMSCache = DictCache()

    # Defaults coming from the module level constants
    self.fuzzyRank = FUZZY_RANK
    self.rank = RANK
    self.requirements = REQUIREMENTS
    self.cpuPowerRef = CPU_POWER_REF
    self.gridEnv = GRIDENV

    PilotDirector.__init__( self, submitPool )
  def __init__( self, submitPool ):
    """
     Set the DIRAC specific defaults and run the PilotDirector initialization.

     :param submitPool: name of the submit pool, forwarded to PilotDirector.__init__

     The process is terminated with a non-zero exit status if /LocalSite/Site
     is not defined in the configuration.
    """
    # Set before calling the parent __init__
    self.gridMiddleware    = 'DIRAC'

    PilotDirector.__init__( self, submitPool )

    self.computingElementList = COMPUTING_ELEMENTS
    self.computingElementDict = {}
    self.addComputingElement( self.computingElementList )

    self.siteName          = gConfig.getValue('/LocalSite/Site','')
    if not self.siteName:
      self.log.error( 'Can not run a Director if Site Name is not defined' )
      # sys.exit() without argument reports success (status 0), but this is a
      # fatal configuration error
      sys.exit( 1 )

    # Private per-instance caches
    self.__failingCECache  = DictCache()
    self.__ticketsCECache  = DictCache()
Example #4
0
    def __init__(self, submitPool):
        """
        Set the DIRAC specific defaults and run the PilotDirector initialization.

        :param submitPool: name of the submit pool, forwarded to
                           PilotDirector.__init__

        The process is terminated with a non-zero exit status if
        /LocalSite/Site is not defined in the configuration.
        """
        # Set before calling the parent __init__
        self.gridMiddleware = 'DIRAC'

        PilotDirector.__init__(self, submitPool)

        self.computingElementList = COMPUTING_ELEMENTS
        self.computingElementDict = {}
        self.addComputingElement(self.computingElementList)

        self.siteName = gConfig.getValue('/LocalSite/Site', '')
        if not self.siteName:
            self.log.error(
                'Can not run a Director if Site Name is not defined')
            # sys.exit() without argument reports success (status 0), but
            # this is a fatal configuration error
            sys.exit(1)

        # Private per-instance caches
        self.__failingCECache = DictCache()
        self.__ticketsCECache = DictCache()
Example #5
0
    def __init__(self, submitPool):
        """
        Define the logger and the default values of the director attributes.

        :param submitPool: name of the submit pool; it is appended to the
                           logger name unless it is equal to
                           self.gridMiddleware
        """
        loggerName = '%sPilotDirector' % self.gridMiddleware
        if submitPool != self.gridMiddleware:
            loggerName = '%sPilotDirector/%s' % (self.gridMiddleware,
                                                 submitPool)
        self.log = gLogger.getSubLogger(loggerName)

        self.pilot = DIRAC_PILOT
        self.extraPilotOptions = []
        # Pilot version and project are looked up in the Operations section
        # for the current VO and setup, with module constants as fallback
        setup = gConfig.getValue('/DIRAC/Setup', '')
        vo = getVO()
        self.installVersion = gConfig.getValue(
            '/Operations/%s/%s/Versions/PilotVersion' % (vo, setup),
            DIRAC_VERSION)
        self.installProject = gConfig.getValue(
            '/Operations/%s/%s/Versions/PilotProject' % (vo, setup), "")
        self.install = DIRAC_INSTALL
        self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE

        self.genericPilotDN = PILOT_DN
        self.genericPilotGroup = PILOT_GROUP
        self.enableListMatch = ENABLE_LISTMATCH
        self.listMatchDelay = LISTMATCH_DELAY
        self.listMatchCache = DictCache()

        self.privatePilotFraction = PRIVATE_PILOT_FRACTION

        self.errorClearTime = ERROR_CLEAR_TIME
        self.errorTicketTime = ERROR_TICKET_TIME
        self.errorMailAddress = DIRAC.errorMail
        self.alarmMailAddress = DIRAC.alarmMail
        self.mailFromAddress = FROM_MAIL

        # self.log is always assigned above, so no fallback logger is needed
        self.log.info('Initialized')
Example #6
0
    def __init__(self, submitPool):
        """
        Define the logger and the default values of the director attributes.

        :param submitPool: name of the submit pool; it is appended to the
                           logger name unless it is equal to
                           self.gridMiddleware, and it is also used to build
                           self.submitPoolOption
        """
        loggerName = '%sPilotDirector' % self.gridMiddleware
        if submitPool != self.gridMiddleware:
            loggerName = '%sPilotDirector/%s' % (self.gridMiddleware,
                                                 submitPool)
        self.log = gLogger.getSubLogger(loggerName)

        self.pilot = DIRAC_PILOT
        self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
        self.extraPilotOptions = []
        self.installVersion = DIRAC_VERSION
        self.installProject = DIRAC_PROJECT
        self.installation = DIRAC_INSTALLATION

        self.virtualOrganization = VIRTUAL_ORGANIZATION
        self.install = DIRAC_INSTALL
        self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
        self.targetGrids = [self.gridMiddleware]

        self.enableListMatch = ENABLE_LISTMATCH
        self.listMatchDelay = LISTMATCH_DELAY
        self.listMatchCache = DictCache()

        self.privatePilotFraction = PRIVATE_PILOT_FRACTION

        self.errorClearTime = ERROR_CLEAR_TIME
        self.errorTicketTime = ERROR_TICKET_TIME
        self.errorMailAddress = DIRAC.errorMail
        self.alarmMailAddress = DIRAC.alarmMail
        self.mailFromAddress = FROM_MAIL

        # self.log is always assigned above, so no fallback logger is needed
        self.log.info('Initialized')
Example #7
0
  def __init__( self, submitPool ):
    """
     Define the logger and the default values of the director attributes.

     :param submitPool: name of the submit pool; it is appended to the logger
                        name unless it is equal to self.gridMiddleware
    """
    loggerName = '%sPilotDirector' % self.gridMiddleware
    if submitPool != self.gridMiddleware:
      loggerName = '%sPilotDirector/%s' % ( self.gridMiddleware, submitPool )
    self.log = gLogger.getSubLogger( loggerName )

    self.pilot = DIRAC_PILOT
    self.extraPilotOptions = []
    # Pilot version and project are looked up in the Operations section for the
    # current VO and setup, with module constants as fallback
    setup = gConfig.getValue( '/DIRAC/Setup', '' )
    vo = getVO()
    self.installVersion = gConfig.getValue( '/Operations/%s/%s/Versions/PilotVersion' % ( vo, setup ),
                                         DIRAC_VERSION )
    self.installProject = gConfig.getValue( '/Operations/%s/%s/Versions/PilotProject' % ( vo, setup ),
                                         "" )
    self.install = DIRAC_INSTALL
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE

    self.genericPilotDN = PILOT_DN
    self.genericPilotGroup = PILOT_GROUP
    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    # self.log is always assigned above, so no fallback logger is needed
    self.log.info( 'Initialized' )
Example #8
0
  def __init__( self, submitPool ):
    """
     Define the logger and the default values of the director attributes.

     :param submitPool: name of the submit pool; it is appended to the logger
                        name unless it is equal to self.gridMiddleware
    """
    loggerName = '%sPilotDirector' % self.gridMiddleware
    if submitPool != self.gridMiddleware:
      loggerName = '%sPilotDirector/%s' % ( self.gridMiddleware, submitPool )
    self.log = gLogger.getSubLogger( loggerName )

    self.pilot = DIRAC_PILOT
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installInstallation = DIRAC_INSTALLATION

    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.targetGrids = [ self.gridMiddleware ]

    self.genericPilotDN = PILOT_DN
    self.genericPilotGroup = PILOT_GROUP
    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    # self.log is always assigned above, so no fallback logger is needed
    self.log.info( 'Initialized' )
Example #9
0
    def __init__(self, submitPool):
        """
        Define the logger and the default values of the director attributes.

        :param submitPool: name of the submit pool; it is appended to the
                           logger name unless it is equal to
                           self.gridMiddleware, and it is also used to build
                           self.submitPoolOption
        """
        loggerName = "%sPilotDirector" % self.gridMiddleware
        if submitPool != self.gridMiddleware:
            loggerName = "%sPilotDirector/%s" % (self.gridMiddleware, submitPool)
        self.log = gLogger.getSubLogger(loggerName)

        self.pilot = DIRAC_PILOT
        self.submitPoolOption = "-o /Resources/Computing/CEDefaults/SubmitPool=%s" % submitPool
        self.extraPilotOptions = []
        self.installVersion = DIRAC_VERSION
        self.installProject = DIRAC_PROJECT
        self.installation = DIRAC_INSTALLATION

        self.virtualOrganization = VIRTUAL_ORGANIZATION
        self.install = DIRAC_INSTALL
        self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
        self.targetGrids = [self.gridMiddleware]

        self.enableListMatch = ENABLE_LISTMATCH
        self.listMatchDelay = LISTMATCH_DELAY
        self.listMatchCache = DictCache()

        self.privatePilotFraction = PRIVATE_PILOT_FRACTION

        self.errorClearTime = ERROR_CLEAR_TIME
        self.errorTicketTime = ERROR_TICKET_TIME
        self.errorMailAddress = DIRAC.errorMail
        self.alarmMailAddress = DIRAC.alarmMail
        self.mailFromAddress = FROM_MAIL

        # self.log is always assigned above, so no fallback logger is needed
        self.log.info("Initialized")
Example #10
0
class GridPilotDirector( PilotDirector ):
  """
    Base Grid PilotDirector class
    Derived classes must declare:
      self.Middleware: It must correspond to the string before "PilotDirector".
        (For proper naming of the logger)
      self.ResourceBrokers: list of Brokers used by the Director.
        (For proper error reporting)
  """
  def __init__( self, submitPool ):
    """
     Set the Grid specific defaults and then run the PilotDirector initialization.

     All the attributes defined here are created before the parent constructor
     is called, which is the last step.
    """
    # Private per-instance caches
    self.__failingWMSCache = DictCache()
    self.__ticketsWMSCache = DictCache()
    self.__listMatchWMSCache = DictCache()

    # Defaults coming from the module level constants
    self.fuzzyRank = FUZZY_RANK
    self.rank = RANK
    self.requirements = REQUIREMENTS
    self.cpuPowerRef = CPU_POWER_REF
    self.gridEnv = GRIDENV

    PilotDirector.__init__( self, submitPool )

  def configure( self, csSection, submitPool ):
    """
     Common configuration for all Grid PilotDirectors.

     After the PilotDirector configuration and the reload of the options from
     the CS, the expired entries of the private WMS caches are purged, the
     brokers still present in the failing cache are taken out of
     self.resourceBrokers and the remaining ones are shuffled.

     :param csSection: CS section of the director
     :param submitPool: name of the submit pool
    """
    PilotDirector.configure( self, csSection, submitPool )
    self.reloadConfiguration( csSection, submitPool )

    self.__failingWMSCache.purgeExpired()
    self.__ticketsWMSCache.purgeExpired()
    for rb in self.__failingWMSCache.getKeys():
      try:
        self.resourceBrokers.remove( rb )
      except ValueError:
        # The broker is not in the list, nothing to remove. Only this error is
        # ignored so that any other problem is not silently swallowed.
        pass

    self.resourceBrokers = List.randomize( self.resourceBrokers )

    if self.gridEnv:
      self.log.info( ' GridEnv:        ', self.gridEnv )
    if self.resourceBrokers:
      self.log.info( ' ResourceBrokers:', ', '.join( self.resourceBrokers ) )

  def configureFromSection( self, mySection ):
    """
      Reload from the given CS section the PilotDirector options plus the Grid
      specific ones. The current value of each attribute is used as default.
    """
    PilotDirector.configureFromSection( self, mySection )

    self.gridEnv = gConfig.getValue( mySection + '/GridEnv', self.gridEnv )
    if not self.gridEnv:
      # Nothing defined for the director: fall back to the GridEnv of the
      # WorkloadManagement instance used by the current setup, if any
      diracSetup = gConfig.getValue( '/DIRAC/Setup', '' )
      if diracSetup:
        wmsInstance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % diracSetup, '' )
        if wmsInstance:
          self.gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % wmsInstance, '' )

    # ( attribute name, CS option name ), reloaded in this order
    gridOptions = ( ( 'resourceBrokers', 'ResourceBrokers' ),
                    ( 'cpuPowerRef', 'CPUPowerRef' ),
                    ( 'requirements', 'Requirements' ),
                    ( 'rank', 'Rank' ),
                    ( 'fuzzyRank', 'FuzzyRank' ) )
    for attribute, option in gridOptions:
      currentValue = getattr( self, attribute )
      setattr( self, attribute, gConfig.getValue( mySection + '/' + option, currentValue ) )

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method does the actual pilot submission to the Grid RB
      The logic is as follows:
      - If there are no available RB it returns an error
      - If there is no VOMS extension in the proxy, it returns an error
      - It creates a temp directory (removed again before returning)
      - It prepares a JDL through self._prepareJDL
        it has some part common to gLite and LCG (the payload description)
        it has some part specific to each middleware
      - If list match is enabled it checks, using self.listMatchCache, that
        there are CEs matching the pilot requirements
      - It submits the pilots and registers the references in pilotAgentsDB
      - If pilotsToSubmit is larger than pilotsPerJob it requests a new proxy
        token and calls itself again for the remaining pilots

      :param workDir: directory where the temporary working directory is created
      :param taskQueueDict: TaskQueue description, 'TaskQueueID' is required
      :param pilotOptions: list of pilot options; when a new token is requested
                           the ProxyToken option is replaced in this same list
      :param pilotsToSubmit: number of pilots still to be submitted
      :param ceMask: forwarded to self._prepareJDL
      :param submitPrivatePilot: forwarded to self._prepareJDL
      :param privateTQ: forwarded to self._prepareJDL
      :param proxy: proxy used for the submission
      :param pilotsPerJob: number of pilots requested in each submission
      :return: S_OK( number of submitted pilots ) or S_ERROR. If the error
               comes from the token request or from the recursive call, the
               returned structure carries in 'Value' the number of pilots
               submitted so far.
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    # The owner is taken from the proxy credentials, not from the TaskQueue
    credDict = proxy.getCredentials()['Value']
    ownerDN = credDict['identity']
    ownerGroup = credDict[ 'group' ]

    if not self.resourceBrokers:
      # Since we can exclude RBs from the list, it may become empty
      return S_ERROR( ERROR_RB )

    # A VOMS extension is needed for the later interactions with WMS
    ret = gProxyManager.getVOMSAttributes( proxy )
    if not ret['OK']:
      self.log.error( ERROR_VOMS, ret['Message'] )
      return S_ERROR( ERROR_VOMS )
    if not ret['Value']:
      return S_ERROR( ERROR_VOMS )

    workingDirectory = tempfile.mkdtemp( prefix = 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    # Write JDL
    retDict = self._prepareJDL( taskQueueDict, workingDirectory, pilotOptions, pilotsPerJob,
                                ceMask, submitPrivatePilot, privateTQ )
    jdl = retDict['JDL']
    pilotRequirements = retDict['Requirements']
    rb = retDict['RB']
    if not jdl:
      # Best effort clean up: a failure removing the directory is not fatal
      shutil.rmtree( workingDirectory, ignore_errors = True )
      return S_ERROR( ERROR_JDL )

    # Check that there are available queues for the Job:
    if self.enableListMatch:
      now = Time.dateTime()
      availableCEs = self.listMatchCache.get( pilotRequirements )
      # Explicit comparison with False: an empty list is a valid cached result
      if availableCEs == False:
        availableCEs = self._listMatch( proxy, jdl, taskQueueID, rb )
        if availableCEs != False:
          self.log.verbose( 'LastListMatch', now )
          self.log.verbose( 'AvailableCEs ', availableCEs )
          self.listMatchCache.add( pilotRequirements, self.listMatchDelay * 60,
                                   value = availableCEs )                      # it is given in minutes
      if not availableCEs:
        shutil.rmtree( workingDirectory, ignore_errors = True )
        return S_ERROR( ERROR_CE + ' TQ: %d' % taskQueueID )

    # Now we are ready for the actual submission, so

    self.log.verbose( 'Submitting Pilots for TaskQueue', taskQueueID )
    submitRet = self._submitPilot( proxy, pilotsPerJob, jdl, taskQueueID, rb )
    shutil.rmtree( workingDirectory, ignore_errors = True )
    if not submitRet:
      return S_ERROR( 'Pilot Submission Failed for TQ %d ' % taskQueueID )
    # submitRet is iterated as ( pilotReference, resourceBroker ) pairs

    submittedPilots = 0

    if pilotsPerJob != 1 and len( submitRet ) != pilotsPerJob:
      # Parametric jobs are used
      for pilotReference, resourceBroker in submitRet:
        pilotReference = self._getChildrenReferences( proxy, pilotReference, taskQueueID )
        submittedPilots += len( pilotReference )
        pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN,
                      ownerGroup, resourceBroker, self.gridMiddleware,
                      pilotRequirements )
    else:
      for pilotReference, resourceBroker in submitRet:
        pilotReference = [pilotReference]
        submittedPilots += len( pilotReference )
        pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN,
                      ownerGroup, resourceBroker, self.gridMiddleware, pilotRequirements )

    # add some sleep here
    time.sleep( 0.1 * submittedPilots )

    if pilotsToSubmit > pilotsPerJob:
      # Additional submissions are necessary, need to get a new token and iterate.
      pilotsToSubmit -= pilotsPerJob
      result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) )
      if not result[ 'OK' ]:
        self.log.error( ERROR_TOKEN, result['Message'] )
        result = S_ERROR( ERROR_TOKEN )
        result['Value'] = submittedPilots
        return result
      ( token, numberOfUses ) = result[ 'Value' ]
      # Drop every previous token option. The list is rebuilt in place:
      # removing items while iterating over the same list skips the element
      # that follows each removed one.
      pilotOptions[:] = [ option for option in pilotOptions
                          if option.find( '-o /Security/ProxyToken=' ) != 0 ]
      pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
      pilotsPerJob = max( 1, min( pilotsPerJob, int( numberOfUses / self.maxJobsInFillMode ) ) )
      result = self._submitPilots( workDir, taskQueueDict, pilotOptions,
                                   pilotsToSubmit, ceMask,
                                   submitPrivatePilot, privateTQ,
                                   proxy, pilotsPerJob )
      if not result['OK']:
        # Report also the pilots that were submitted before the failure
        if 'Value' not in result:
          result['Value'] = 0
        result['Value'] += submittedPilots
        return result
      submittedPilots += result['Value']

    return S_OK( submittedPilots )

  def _prepareJDL( self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ ):
    """
      Prepare the JDL for the submission. This method should be overridden in
      a subclass; _submitPilots reads the 'JDL', 'Requirements' and 'RB' keys
      of the returned value.

      This base implementation only logs an error and terminates the process
      with a non-zero exit status.
    """
    self.log.error( '_prepareJDL() method should be implemented in a subclass' )
    # sys.exit() without argument would report success (exit status 0)
    sys.exit( 1 )

  def _JobJDL( self, taskQueueDict, pilotOptions, ceMask ):
    """
     The Job JDL is the same for LCG and GLite
    """
    pilotJDL = 'Executable     = "%s";\n' % os.path.basename( self.pilot )
    executable = self.pilot

    pilotJDL += 'Arguments     = "%s";\n' % ' '.join( pilotOptions )

    pilotJDL += 'CPUTimeRef    = %s;\n' % taskQueueDict['CPUTime']

    pilotJDL += 'CPUPowerRef   = %s;\n' % self.cpuPowerRef

    pilotJDL += """CPUWorkRef    = real( CPUTimeRef * CPUPowerRef );

Lookup        = "CPUScalingReferenceSI00=*";
cap = isList( other.GlueCECapability ) ? other.GlueCECapability : { "dummy" };
i0 = regexp( Lookup, cap[0] ) ? 0 : undefined;
i1 = isString( cap[1] ) && regexp( Lookup, cap[1] ) ? 1 : i0;
i2 = isString( cap[2] ) && regexp( Lookup, cap[2] ) ? 2 : i1;
i3 = isString( cap[3] ) && regexp( Lookup, cap[3] ) ? 3 : i2;
i4 = isString( cap[4] ) && regexp( Lookup, cap[4] ) ? 4 : i3;
i5 = isString( cap[5] ) && regexp( Lookup, cap[5] ) ? 5 : i4;
index = isString( cap[6] ) && regexp( Lookup, cap[6] ) ? 6 : i5;
i = isUndefined( index ) ? 0 : index;

QueuePowerRef = real( ! isUndefined( index ) ? int( substr( cap[i], size( Lookup ) - 1 ) ) : other.GlueHostBenchmarkSI00 );
QueueTimeRef  = real( other.GlueCEPolicyMaxCPUTime * 60 );
QueueWorkRef  = QueuePowerRef * QueueTimeRef;
"""

    requirements = list( self.requirements )
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      # if there an explicit Grig CE requested by the TQ, remove the Ranking requirement
      for req in self.requirements:
        if req.strip().lower()[:6] == 'rank >':
          requirements.remove( req )

    requirements.append( 'QueueWorkRef > CPUWorkRef' )

    siteRequirements = '\n || '.join( [ 'other.GlueCEInfoHostName == "%s"' % s for s in ceMask ] )
    requirements.append( "( %s\n )" % siteRequirements )

    pilotRequirements = '\n && '.join( requirements )

    pilotJDL += 'pilotRequirements  = %s;\n' % pilotRequirements

    pilotJDL += 'Rank          = %s;\n' % self.rank
    pilotJDL += 'FuzzyRank     = %s;\n' % self.fuzzyRank
    pilotJDL += 'StdOutput     = "%s";\n' % outputSandboxFiles[0]
    pilotJDL += 'StdError      = "%s";\n' % outputSandboxFiles[1]

    pilotJDL += 'InputSandbox  = { "%s" };\n' % '", "'.join( [ self.install, executable ] )

    pilotJDL += 'OutputSandbox = { %s };\n' % ', '.join( [ '"%s"' % f for f in outputSandboxFiles ] )

    self.log.verbose( pilotJDL )

    return ( pilotJDL, pilotRequirements )


  def parseListMatchStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Execute the List Match command and extract the matched CE's from its stdout.

      Returns False if the command can not be executed or ends with a non-zero
      status (self.__sendErrorMail is called in both cases). Otherwise it
      returns the list, possibly empty, of the stdout lines that name a CE.
    """
    self.log.verbose( 'Executing List Match for TaskQueue', taskQueueID )

    startTime = time.time()
    result = executeGridCommand( proxy, cmd, self.gridEnv )

    if not result['OK']:
      self.log.error( 'Failed to execute List Match:', result['Message'] )
      self.__sendErrorMail( rb, 'List Match', cmd, result, proxy )
      return False

    status = result['Value'][0]
    if status != 0:
      self.log.error( 'Error executing List Match:', str( status ) + '\n'.join( result['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'List Match', cmd, result, proxy )
      return False

    elapsed = time.time() - startTime
    self.log.info( 'List Match Execution Time: %.2f for TaskQueue %d' % ( elapsed, taskQueueID ) )

    stdout = result['Value'][1]
    stderr = result['Value'][2]

    # Keep the std.out lines that mention a jobmanager or a cream queue
    # TODO: the line has to be stripped from extra info
    matchedCEs = [ line for line in List.fromChar( stdout, '\n' )
                   if re.search( '/jobmanager-', line ) or re.search( '/cream-', line ) ]

    if matchedCEs:
      self.log.debug( 'List-Match returns:', str( result['Value'][0] ) + '\n'.join( result['Value'][1:3] ) )
      self.log.info( 'List-Match found %s CEs for TaskQueue' % len( matchedCEs ), taskQueueID )
      self.log.verbose( ', '.join( matchedCEs ) )
    else:
      self.log.info( 'List-Match failed to find CEs for TaskQueue', taskQueueID )
      self.log.info( stdout )
      self.log.info( stderr )

    return matchedCEs

  def parseJobSubmitStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Execute the Job Submit command and parse its stdout to get the pilot
      reference.

      :param proxy: proxy used to execute the command
      :param cmd: command to execute
      :param taskQueueID: TaskQueue identifier, used in the log messages
      :param rb: resource broker, only used when reporting errors
      :return: False if the command fails or no "https:" reference is found in
               its stdout. Otherwise the tuple ( reference, host ), where
               reference is the last one found and host is what is between
               "https://" and the last ":" of the first reference that has
               such a part ('' if none has it).
    """
    start = time.time()
    self.log.verbose( 'Executing Job Submit for TaskQueue', taskQueueID )

    ret = executeGridCommand( proxy, cmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute Job Submit:', ret['Message'] )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing Job Submit:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    self.log.info( 'Job Submit Execution Time: %.2f for TaskQueue %d' % ( ( time.time() - start ), taskQueueID ) )

    stdout = ret['Value'][1]

    pilotReference = None
    resourceBroker = ''
    for line in List.fromChar( stdout, '\n' ):
      # Raw strings: "\S" is not a valid escape sequence in a normal string
      refMatch = re.search( r"(https:\S+)", line )
      if not refMatch:
        continue
      pilotReference = refMatch.group( 1 )
      if not resourceBroker:
        hostMatch = re.search( r"https://(.+):.+", pilotReference )
        # A reference without a ":port" part does not match, in that case
        # the host is looked for in the next reference instead of failing
        if hostMatch:
          resourceBroker = hostMatch.group( 1 )

    if pilotReference is None:
      self.log.error( 'Job Submit returns no Reference:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      return False

    self.log.info( 'Reference %s for TaskQueue %s' % ( pilotReference, taskQueueID ) )

    return pilotReference, resourceBroker

  def _writeJDL( self, filename, jdlList ):
    """
      Write the JDL lines, joined by newlines, into the given file.

      :param filename: path of the file to write, opened in 'w' mode
      :param jdlList: list of strings
      :return: filename if the file is written, '' if anything fails (the
               exception is logged)
    """
    try:
      f = open( filename, 'w' )
      try:
        f.write( '\n'.join( jdlList ) )
      finally:
        # Do not leak the file handle if the write fails
        f.close()
    except Exception:
      self.log.exception()
      return ''

    return filename
Example #11
0
class PilotDirector:
  """
    Base Pilot Director class.
    Derived classes must implement:
      * __init__( self, submitPool ):
          that must call the parent class __init__ method and then do its own initialization
      * configure( self, csSection, submitPool ):
          that must call the parent class configure method and the do its own configuration
      * _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                      submitPrivatePilot, privateTQ, proxy, pilotsPerJob )
          actual method doing the submission to the backend once the submitPilots method
          has prepared the common part

    Derived classes might implement:
      * configureFromSection( self, mySection ):
          to reload from a CS section the additional datamembers they might have defined.

    If additional datamembers are defined, they must:
      - be declared in the __init__
      - be reconfigured in the configureFromSection method by executing
        self.reloadConfiguration( csSection, submitPool ) in theri configure method
  """
  gridMiddleware = ''

  def __init__( self, submitPool ):
    """
     Define the logger and the default values of the director attributes.

     :param submitPool: name of the submit pool; it is appended to the logger
                        name unless it is equal to self.gridMiddleware
    """
    loggerName = '%sPilotDirector' % self.gridMiddleware
    if submitPool != self.gridMiddleware:
      loggerName = '%sPilotDirector/%s' % ( self.gridMiddleware, submitPool )
    self.log = gLogger.getSubLogger( loggerName )

    self.pilot = DIRAC_PILOT
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installInstallation = DIRAC_INSTALLATION

    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE

    self.genericPilotDN = PILOT_DN
    self.genericPilotGroup = PILOT_GROUP
    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    # self.log is always assigned above, so no fallback logger is needed
    self.log.info( 'Initialized' )

  def configure( self, csSection, submitPool ):
    """
     Common configuration for all PilotDirectors.

     The options are loaded from csSection and then reloaded for the
     middleware and the submit pool. The pilot version and installation are
     finally looked up in the Versions section of the Operations for the
     current setup, and the resulting configuration is logged.
    """
    self.configureFromSection( csSection )
    self.reloadConfiguration( csSection, submitPool )

    versionsSection = cfgPath( 'Operations', self.virtualOrganization,
                               gConfig.getValue( '/DIRAC/Setup', '' ), 'Versions' )
    versionOption = cfgPath( versionsSection, 'PilotVersion' )
    self.installVersion = gConfig.getValue( versionOption, self.installVersion )
    installationOption = cfgPath( versionsSection, 'PilotInstallation' )
    self.installInstallation = gConfig.getValue( installationOption, self.installInstallation )

    for header in ( '===============================================', 'Configuration:', '' ):
      self.log.info( header )

    for label, value in ( ( ' Install script: ', self.install ),
                          ( ' Pilot script:   ', self.pilot ),
                          ( ' Install Ver:    ', self.installVersion ) ):
      self.log.info( label, value )

    # Optional settings are only reported when they are defined
    if self.installInstallation:
      self.log.info( ' Installation:        ', self.installInstallation )
    if self.extraPilotOptions:
      self.log.info( ' Exta Options:   ', ' '.join( self.extraPilotOptions ) )

    self.log.info( ' ListMatch:      ', self.enableListMatch )
    self.log.info( ' Private %:      ', self.privatePilotFraction * 100 )
    if self.enableListMatch:
      self.log.info( ' ListMatch Delay:', self.listMatchDelay )

    self.listMatchCache.purgeExpired()

  def reloadConfiguration( self, csSection, submitPool ):
    """
     Common Configuration can be overwriten for each GridMiddleware
    """
    mySection = csSection + '/' + self.gridMiddleware
    self.configureFromSection( mySection )

    # And Again for each SubmitPool
    mySection = csSection + '/' + submitPool
    self.configureFromSection( mySection )

  def configureFromSection( self, mySection ):
    """
      Reload the director options from the given CS section. The current
      value of each attribute is used as default for its option.
    """
    # ( attribute name, CS option name ), reloaded in this order
    csOptions = ( ( 'pilot', 'PilotScript' ),
                  ( 'installVersion', 'DIRACVersion' ),
                  ( 'extraPilotOptions', 'ExtraPilotOptions' ),
                  ( 'install', 'InstallScript' ),
                  ( 'installInstallation', 'Installation' ),
                  ( 'maxJobsInFillMode', 'MaxJobsInFillMode' ),
                  ( 'enableListMatch', 'EnableListMatch' ),
                  ( 'listMatchDelay', 'ListMatchDelay' ),
                  ( 'errorClearTime', 'ErrorClearTime' ),
                  ( 'errorTicketTime', 'ErrorTicketTime' ),
                  ( 'errorMailAddress', 'ErrorMailAddress' ),
                  ( 'alarmMailAddress', 'AlarmMailAddress' ),
                  ( 'mailFromAddress', 'MailFromAddress' ),
                  ( 'genericPilotDN', 'GenericPilotDN' ),
                  ( 'genericPilotGroup', 'GenericPilotGroup' ),
                  ( 'privatePilotFraction', 'PrivatePilotFraction' ) )
    for attribute, option in csOptions:
      currentValue = getattr( self, attribute )
      setattr( self, attribute, gConfig.getValue( mySection + '/' + option, currentValue ) )

    # The first non-empty value wins: the CS option, then what getVOForGroup
    # returns for 'NonExistingGroup', then the current value
    self.virtualOrganization = ( gConfig.getValue( mySection + '/VirtualOrganization' , '' )
                                 or getVOForGroup( 'NonExistingGroup' )
                                 or self.virtualOrganization )

  def _resolveCECandidates( self, taskQueueDict ):
    """
      Return a list of CEs for this TaskQueue
    """
    # assume user knows what they're doing and avoid site mask e.g. sam jobs
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      self.log.info( 'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                     ', '.join( taskQueueDict['GridCEs'] ) )
      return taskQueueDict['GridCEs']

    # Get the mask
    ret = jobDB.getSiteMask()
    if not ret['OK']:
      self.log.error( 'Can not retrieve site Mask from DB:', ret['Message'] )
      return []

    siteMask = ret['Value']
    if not siteMask:
      self.log.error( 'Site mask is empty' )
      return []

    self.log.verbose( 'Site Mask: %s' % ', '.join( siteMask ) )

    # remove banned sites from siteMask
    if 'BannedSites' in taskQueueDict:
      for site in taskQueueDict['BannedSites']:
        if site in siteMask:
          siteMask.remove( site )
          self.log.verbose( 'Removing banned site %s from site Mask' % site )

    # remove from the mask if a Site is given
    siteMask = [ site for site in siteMask if 'Sites' not in taskQueueDict or site in taskQueueDict['Sites'] ]

    if not siteMask:
      # pilot can not be submitted
      self.log.info( 'No Valid Site Candidate in Mask for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      return []

    self.log.info( 'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) )

    # Get CE's associates to the given site Names
    ceMask = []

    section = '/Resources/Sites/%s' % self.gridMiddleware
    ret = gConfig.getSections( section )
    if not ret['OK']:
      # To avoid duplicating sites listed in LCG for gLite for example.
      # This could be passed as a parameter from
      # the sub class to avoid below...
      section = '/Resources/Sites/LCG'
      ret = gConfig.getSections( section )

    if not ret['OK'] or not ret['Value']:
      self.log.error( 'Could not obtain CEs from CS', ret['Message'] )
      return []

    gridSites = ret['Value']
    for siteName in gridSites:
      if siteName in siteMask:
        ret = gConfig.getValue( '%s/%s/CE' % ( section, siteName ), [] )
        for ce in ret:
          submissionMode = gConfig.getValue( '%s/%s/CEs/%s/SubmissionMode' % ( section, siteName, ce ), 'gLite' )
          if submissionMode == self.gridMiddleware:
            ceMask.append( ce )

    if not ceMask:
      self.log.info( 'No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) )

    self.log.verbose( 'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( ceMask ) )

    return ceMask

  def _getPilotOptions( self, taskQueueDict, pilotsToSubmit ):
    """
      Build the command line options of the pilots for the given TaskQueue.

      A private pilot is prepared when the TaskQueue lists a 'private' PilotType
      or when the random draw against self.privatePilotFraction says so, unless
      the TaskQueue carries 'ForceGeneric'. Otherwise a generic pilot is prepared
      and a proxy token is requested for it.

      Returns S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup,
                      submitPrivatePilot, privateTQ ) )
      or S_ERROR( ERROR_TOKEN ) when the token can not be obtained.
    """

    # Need to limit the maximum number of pilots to submit at once
    # For generic pilots this is limited by the number of use of the tokens and the
    # maximum number of jobs in Filling mode, but for private Jobs we need an extra limitation:
    pilotsToSubmit = min( pilotsToSubmit, int( 50 / self.maxJobsInFillMode ) )
    pilotOptions = [ "-V %s" % self.virtualOrganization ]
    privateIfGenericTQ = self.privatePilotFraction > random.random()
    privateTQ = ( 'PilotTypes' in taskQueueDict and 'private' in [ t.lower() for t in taskQueueDict['PilotTypes'] ] )
    forceGeneric = 'ForceGeneric' in taskQueueDict
    submitPrivatePilot = ( privateIfGenericTQ or privateTQ ) and not forceGeneric
    if submitPrivatePilot:
      self.log.verbose( 'Submitting private pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      ownerDN = taskQueueDict['OwnerDN']
      ownerGroup = taskQueueDict['OwnerGroup']
      # User Group requirement
      pilotOptions.append( '-G %s' % ownerGroup )
      # check if group allows jobsharing
      ownerGroupProperties = getPropertiesForGroup( ownerGroup )
      if 'JobSharing' not in ownerGroupProperties:
        # Add Owner requirement to pilot
        pilotOptions.append( "-O '%s'" % ownerDN )
      if privateTQ:
        pilotOptions.append( '-o /Resources/Computing/CEDefaults/PilotType=private' )
      maxJobsInFillMode = self.maxJobsInFillMode
    else:
      #For generic jobs we'll submit mixture of generic and private pilots
      self.log.verbose( 'Submitting generic pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      ownerDN = self.genericPilotDN
      ownerGroup = self.genericPilotGroup
      result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) )
      if not result[ 'OK' ]:
        self.log.error( ERROR_TOKEN, result['Message'] )
        return S_ERROR( ERROR_TOKEN )
      ( token, numberOfUses ) = result[ 'Value' ]
      pilotsToSubmit = min( numberOfUses, pilotsToSubmit )

      pilotOptions.append( '-o /Security/ProxyToken=%s' % token )

      # Ceiling of pilotsToSubmit / maxJobsInFillMode. Floor division is spelled
      # out so that the result stays an integer whatever division mode is active.
      pilotsToSubmit = ( pilotsToSubmit - 1 ) // self.maxJobsInFillMode + 1

      # NOTE(review): pilotsToSubmit is 0 here when no pilot or no token use is
      # left, and the division below then raises ZeroDivisionError - confirm that
      # callers rely on catching it.
      maxJobsInFillMode = int( numberOfUses / pilotsToSubmit )
    # Use Filling mode
    pilotOptions.append( '-M %s' % maxJobsInFillMode )

    # Debug
    pilotOptions.append( '-d' )
    # Setup.
    pilotOptions.append( '-S %s' % taskQueueDict['Setup'] )
    # CS Servers
    csServers = gConfig.getValue( "/DIRAC/Configuration/Servers", [] )
    pilotOptions.append( '-C %s' % ",".join( csServers ) )
    # DIRAC Extensions
    extensionsList = getCSExtensions()
    if extensionsList:
      pilotOptions.append( '-e %s' % ",".join( extensionsList ) )
    # Requested version of DIRAC
    pilotOptions.append( '-r %s' % self.installVersion )
    # Requested Project to install
    pilotOptions.append( '-V %s' % self.installInstallation )
    # Requested CPU time
    pilotOptions.append( '-T %s' % taskQueueDict['CPUTime'] )

    if self.extraPilotOptions:
      pilotOptions.extend( self.extraPilotOptions )

    return S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup, submitPrivatePilot, privateTQ ) )

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      Placeholder for the Backend specific submission, to be implemented in the
      derived class.
      A missing implementation is a problem of the Director and not of the Job:
      the error is logged and an empty S_OK is returned.
    """
    notImplementedMessage = '_submitPilots method not implemented'
    self.log.error( notImplementedMessage )
    return S_OK()


  def submitPilots( self, taskQueueDict, pilotsToSubmit, workDir = None ):
    """
      Submit pilots for the given TaskQueue.

      Resolves the candidate CEs, builds the pilot options, gets a proxy for the
      pilot owner and hands everything over to the Backend specific
      _submitPilots, whose result is returned.
      Any exception raised on the way is logged and S_OK( 0 ) is returned.
    """
    try:
      tqID = taskQueueDict['TaskQueueID']
      self.log.verbose( 'Submitting Pilot' )

      candidateCEs = self._resolveCECandidates( taskQueueDict )
      if not candidateCEs:
        return S_ERROR( 'No CE available for TaskQueue %d' % int( tqID ) )

      optionsResult = self._getPilotOptions( taskQueueDict, pilotsToSubmit )
      if not optionsResult['OK']:
        return optionsResult
      options, perJob, dn, group, isPrivatePilot, isPrivateTQ = optionsResult['Value']

      # ask for a long lived proxy (5 days) to avoid renewal
      fiveDays = 86400 * 5
      proxyResult = self._getPilotProxyFromDIRACGroup( dn, group, requiredTimeLeft = fiveDays )
      if not proxyResult['OK']:
        self.log.error( proxyResult['Message'] )
        self.log.error( 'No proxy Available', 'User "%s", Group "%s"' % ( dn, group ) )
        return S_ERROR( ERROR_PROXY )
      pilotProxy = proxyResult['Value']

      # the final submission is done by the Grid specific method
      return self._submitPilots( workDir, taskQueueDict, options,
                                 pilotsToSubmit, candidateCEs,
                                 isPrivatePilot, isPrivateTQ,
                                 pilotProxy, perJob )
    except Exception:
      self.log.exception( 'Error in Pilot Submission' )
      return S_OK( 0 )

  def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """
      Ask the proxy manager for the pilot proxy of the given owner DN and group
      with the required time left, and return its result as is.
      To be overwritten if a given Pilot does not require a full proxy
    """
    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft )
    return proxyResult

  def exceptionCallBack( self, threadedJob, exceptionInfo ):
    """
      Log the exception currently being handled as a pilot submission error.
      Neither threadedJob nor exceptionInfo is used.
    """
    errorMessage = 'Error in Pilot Submission'
    self.log.exception( errorMessage )
Example #12
0
class DIRACPilotDirector(PilotDirector):
    """
    DIRAC PilotDirector class

    Submits pilots directly to the ComputingElement objects created from the
    configured list of ComputingElements.
    """
    def __init__(self, submitPool):
        """
        Define some defaults and call parent __init__

        The process is terminated when /LocalSite/Site is not defined.
        """
        self.gridMiddleware = 'DIRAC'

        PilotDirector.__init__(self, submitPool)

        self.computingElementList = COMPUTING_ELEMENTS
        self.computingElementDict = {}
        self.addComputingElement(self.computingElementList)

        self.siteName = gConfig.getValue('/LocalSite/Site', '')
        if not self.siteName:
            self.log.error(
                'Can not run a Director if Site Name is not defined')
            sys.exit()

        self.__failingCECache = DictCache()
        self.__ticketsCECache = DictCache()

    def configure(self, csSection, submitPool):
        """
        Here goes common configuration for DIRAC PilotDirector

        Reloads the configuration, drops the CEs found in the failing CE cache
        and selects one of the remaining CEs as self.computingElement.
        Nothing is selected when no CE is left.
        """

        PilotDirector.configure(self, csSection, submitPool)
        self.reloadConfiguration(csSection, submitPool)

        self.__failingCECache.purgeExpired()
        self.__ticketsCECache.purgeExpired()

        # forget about the CEs currently flagged as failing
        for ce in self.__failingCECache.getKeys():
            self.computingElementDict.pop(ce, None)

        if not self.computingElementDict:
            return
        self.log.info(' ComputingElements:',
                      ', '.join(self.computingElementDict.keys()))

        # FIXME: this is to start testing
        # list() keeps the lookup working when values() is not indexable
        ceConfigDict = list(self.computingElementDict.values())[0]

        self.computingElement = ceConfigDict['CE']

        self.log.debug(self.computingElement.getDynamicInfo())

        self.log.info(' SiteName:', self.siteName)

    def configureFromSection(self, mySection):
        """
        reload from CS

        Reads the ComputingElements and SiteName options of mySection, keeping
        the current values as defaults.
        """
        PilotDirector.configureFromSection(self, mySection)

        self.computingElementList = gConfig.getValue(
            mySection + '/ComputingElements', self.computingElementList)
        self.addComputingElement(self.computingElementList)

        self.siteName = gConfig.getValue(mySection + '/SiteName',
                                         self.siteName)

    def addComputingElement(self, ceList):
        """
        Check if a CE object for the current CE is available,
        instantiate one if necessary

        Processing stops at the first CE that can not be created; the
        remaining CEs of ceList are not added.
        """
        for CE in ceList:
            if CE in self.computingElementDict:
                continue
            ceFactory = ComputingElementFactory()
            ceInstance = ceFactory.getCE(ceName=CE)
            if not ceInstance['OK']:
                self.log.error('Can not create CE object:',
                               ceInstance['Message'])
                return
            self.computingElementDict[CE] = ceInstance['Value'].ceConfigDict
            # add the 'CE' instance at the end to avoid being overwritten
            self.computingElementDict[CE]['CE'] = ceInstance['Value']

    def _submitPilots(self, workDir, taskQueueDict, pilotOptions,
                      pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ,
                      proxy, pilotsPerJob):
        """
        This method does the actual pilot submission to the DIRAC CE
        The logic is as follows:
        - If list match is enabled and there is no queue available,
          it returns error
        - It creates a temp directory and moves into it
        - For every CE it prepares a PilotScript with the options of that CE
          and submits pilots until pilotsToSubmit is reached or the CE
          reports no more available slots
        - The temp directory is removed and the original working directory
          restored whatever the outcome

        The pilotOptions list received from the caller is not modified.
        Returns S_OK( number of submitted pilots ) or S_ERROR.
        """

        taskQueueID = taskQueueDict['TaskQueueID']

        submittedPilots = 0

        pilotRequirements = [('CPUTime', taskQueueDict['CPUTime'])]
        # do we need to care about anything else?
        pilotRequirementsString = str(pilotRequirements)

        # Check that there are available queues for the Jobs.
        # The check only applies when list match is enabled.
        if self.enableListMatch:
            availableQueues = self.listMatchCache.get(pilotRequirementsString)
            # a value equal to False is handled as "not in the cache"
            if availableQueues == False:
                availableQueues = self._listQueues(pilotRequirements)
                if availableQueues != False:
                    self.listMatchCache.add(pilotRequirementsString,
                                            self.listMatchDelay,
                                            availableQueues)
                    self.log.verbose(
                        'Available Queues for TaskQueue ',
                        "%s: %s" % (taskQueueID, str(availableQueues)))

            if not availableQueues:
                return S_ERROR(ERROR_CE + ' TQ: %s' % taskQueueID)

        baseDir = os.getcwd()
        workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID,
                                            dir=workDir)
        self.log.verbose('Using working Directory:', workingDirectory)

        try:
            os.chdir(workingDirectory)

            # options common to all CEs, starting with the Site Name
            commonOptions = list(pilotOptions)
            commonOptions.append("-n '%s'" % self.siteName)

            # submit pilots for every CE available
            for ceName, ceConfigDict in self.computingElementDict.items():
                computingElement = ceConfigDict['CE']

                # each CE starts from the common options, the options of
                # one CE must not leak into the pilots of the next one
                ceOptions = list(commonOptions)

                # add possible requirements from Site and CE
                for req, val in getResourceDict(ceName).items():
                    ceOptions.append("-o '/AgentJobRequirements/%s=%s'" %
                                     (req, val))

                if 'ClientPlatform' in ceConfigDict:
                    ceOptions.append("-p '%s'" %
                                     ceConfigDict['ClientPlatform'])

                if 'SharedArea' in ceConfigDict:
                    ceOptions.append("-o '/LocalSite/SharedArea=%s'" %
                                     ceConfigDict['SharedArea'])

                if 'CPUScalingFactor' in ceConfigDict:
                    ceOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" %
                                     ceConfigDict['CPUScalingFactor'])

                if 'CPUNormalizationFactor' in ceConfigDict:
                    ceOptions.append(
                        "-o '/LocalSite/CPUNormalizationFactor=%s'" %
                        ceConfigDict['CPUNormalizationFactor'])

                self.log.info("pilotOptions: ", ' '.join(ceOptions))

                httpProxy = ''
                if 'HttpProxy' in ceConfigDict:
                    httpProxy = ceConfigDict['HttpProxy']

                pilotExecDir = ''
                if 'JobExecDir' in ceConfigDict:
                    pilotExecDir = ceConfigDict['JobExecDir']

                try:
                    pilotScript = self._writePilotScript(workingDirectory,
                                                         ceOptions, proxy,
                                                         httpProxy,
                                                         pilotExecDir)
                except Exception:
                    self.log.exception(ERROR_SCRIPT)
                    return S_ERROR(ERROR_SCRIPT)

                self.log.info("Pilots to submit: ", pilotsToSubmit)
                while submittedPilots < pilotsToSubmit:
                    # Find out how many pilots can be submitted
                    ret = computingElement.available()
                    if not ret['OK']:
                        self.log.error(
                            'Can not determine if pilot should be submitted: ',
                            ret['Message'])
                        break
                    maxPilotsToSubmit = ret['Value']
                    self.log.info("Submit Pilots: ", maxPilotsToSubmit)
                    if not maxPilotsToSubmit:
                        break
                    # submit the pilots and then check again
                    for _ in range(
                            min(maxPilotsToSubmit,
                                pilotsToSubmit - submittedPilots)):
                        submission = computingElement.submitJob(
                            pilotScript, '', '')
                        if not submission['OK']:
                            self.log.error('Pilot submission failed: ',
                                           submission['Message'])
                            return S_ERROR('Pilot submission failed after ' +
                                           str(submittedPilots) +
                                           ' pilots submitted successful')
                        submittedPilots += 1
                        # let the batch system some time to digest the submitted job
                        time.sleep(1)

                #next CE
        finally:
            # best effort cleanup, it must never hide the result of the submission
            try:
                os.chdir(baseDir)
                shutil.rmtree(workingDirectory)
            except Exception:
                self.log.verbose('Could not clean up working Directory:',
                                 workingDirectory)

        return S_OK(submittedPilots)

    def _listQueues(self, pilotRequirements):
        """
        Ask self.computingElement for what is available for the given
        pilot requirements.
        Currently only CPU time is considered

        Returns the value reported by the CE, or False on error.
        """
        result = self.computingElement.available(pilotRequirements)
        if not result['OK']:
            self.log.error('Can not determine available queues',
                           result['Message'])
            return False
        return result['Value']

    def __readFile(self, fileName):
        """
        Return the binary content of fileName, closing the file in any case
        """
        fileObject = open(fileName, "rb")
        try:
            return fileObject.read()
        finally:
            fileObject.close()

    def _writePilotScript(self, workingDirectory, pilotOptions, proxy,
                          httpProxy, pilotExecDir):
        """
        Prepare the script to execute the pilot
        For the moment it will do like Grid Pilots, a full DIRAC installation

        It assumes that the pilot script will have access to the submit working directory

        Returns the path of the wrapper script written in workingDirectory.
        Any failure is raised as an exception, which is what the caller
        handles; an error structure is never returned in place of the path.
        """
        try:
            compressedAndEncodedProxy = base64.encodestring(
                bz2.compress(proxy.dumpAllToString()['Value'])).replace(
                    '\n', '')
            compressedAndEncodedPilot = base64.encodestring(
                bz2.compress(self.__readFile(self.pilot),
                             9)).replace('\n', '')
            compressedAndEncodedInstall = base64.encodestring(
                bz2.compress(self.__readFile(self.install),
                             9)).replace('\n', '')
        except Exception:
            self.log.exception(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )
            raise

        localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None 
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  os.chdir( pilotWorkingDirectory )
  open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedProxy)s" ) ) )
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedPilot)s" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedInstall)s" ) ) )
  os.chmod("proxy",0600)
  os.chmod("%(pilotScript)s",0700)
  os.chmod("%(installScript)s",0700)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % {
            'compressedAndEncodedProxy': compressedAndEncodedProxy,
            'compressedAndEncodedPilot': compressedAndEncodedPilot,
            'compressedAndEncodedInstall': compressedAndEncodedInstall,
            'httpProxy': httpProxy,
            'pilotScript': os.path.basename(self.pilot),
            'installScript': os.path.basename(self.install),
            'pilotOptions': ' '.join(pilotOptions),
            'pilotExecDir': pilotExecDir
        }

        fd, name = tempfile.mkstemp(suffix='_pilotwrapper.py',
                                    prefix='DIRAC_',
                                    dir=workingDirectory)
        pilotWrapper = os.fdopen(fd, 'w')
        try:
            pilotWrapper.write(localPilot)
        finally:
            pilotWrapper.close()

        return name

    def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup,
                                     requiredTimeLeft):
        """
        Download a limited pilot proxy with VOMS extensions depending on the group

        A plain limited proxy is downloaded when no VOMS attribute is
        defined for ownerGroup.
        """
        #Assign VOMS attribute
        vomsAttr = CS.getVOMSAttributeForGroup(ownerGroup)
        if not vomsAttr:
            self.log.info("Downloading a proxy without VOMS extensions")
            return gProxyManager.downloadProxy(
                ownerDN,
                ownerGroup,
                limited=True,
                requiredTimeLeft=requiredTimeLeft)

        self.log.info("Downloading a proxy with '%s' VOMS extension" %
                      vomsAttr)
        return gProxyManager.downloadVOMSProxy(
            ownerDN,
            ownerGroup,
            limited=True,
            requiredTimeLeft=requiredTimeLeft,
            requiredVOMSAttribute=vomsAttr)
Example #13
0
class DIRACPilotDirector(PilotDirector):
  """
    DIRAC PilotDirector class

    Submits pilots directly to the ComputingElement objects created from the
    configured list of ComputingElements.
  """
  def __init__( self, submitPool ):
    """
     Define some defaults and call parent __init__

     The process is terminated when /LocalSite/Site is not defined.
    """
    self.gridMiddleware    = 'DIRAC'

    PilotDirector.__init__( self, submitPool )

    self.computingElementList = COMPUTING_ELEMENTS
    self.computingElementDict = {}
    self.addComputingElement( self.computingElementList )

    self.siteName          = gConfig.getValue('/LocalSite/Site','')
    if not self.siteName:
      self.log.error( 'Can not run a Director if Site Name is not defined' )
      sys.exit()

    self.__failingCECache  = DictCache()
    self.__ticketsCECache  = DictCache()

  def configure(self, csSection, submitPool ):
    """
     Here goes common configuration for DIRAC PilotDirector

     Reloads the configuration, drops the CEs found in the failing CE cache
     and selects one of the remaining CEs as self.computingElement.
     Nothing is selected when no CE is left.
    """

    PilotDirector.configure( self, csSection, submitPool )
    self.reloadConfiguration( csSection, submitPool )

    self.__failingCECache.purgeExpired()
    self.__ticketsCECache.purgeExpired()

    # forget about the CEs currently flagged as failing
    for ce in self.__failingCECache.getKeys():
      self.computingElementDict.pop( ce, None )

    if not self.computingElementDict:
      return
    self.log.info( ' ComputingElements:', ', '.join(self.computingElementDict.keys()) )

    # FIXME: this is to start testing
    # list() keeps the lookup working when values() is not indexable
    ceConfigDict = list( self.computingElementDict.values() )[0]

    self.computingElement = ceConfigDict['CE']

    self.log.debug( self.computingElement.getDynamicInfo() )

    self.log.info( ' SiteName:', self.siteName )


  def configureFromSection( self, mySection ):
    """
      reload from CS

      Reads the ComputingElements and SiteName options of mySection, keeping
      the current values as defaults.
    """
    PilotDirector.configureFromSection( self, mySection )

    self.computingElementList = gConfig.getValue( mySection+'/ComputingElements'      , self.computingElementList )
    self.addComputingElement( self.computingElementList )

    self.siteName             = gConfig.getValue( mySection+'/SiteName'               , self.siteName )


  def addComputingElement(self, ceList):
    """
      Check if a CE object for the current CE is available,
      instantiate one if necessary

      Processing stops at the first CE that can not be created; the
      remaining CEs of ceList are not added.
    """
    for CE in ceList:
      if CE in self.computingElementDict:
        continue
      ceFactory = ComputingElementFactory( )
      ceInstance = ceFactory.getCE( ceName = CE )
      if not ceInstance['OK']:
        self.log.error('Can not create CE object:', ceInstance['Message'])
        return
      self.computingElementDict[CE] = ceInstance['Value'].ceConfigDict
      # add the 'CE' instance at the end to avoid being overwritten
      self.computingElementDict[CE]['CE'] = ceInstance['Value']


  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method does the actual pilot submission to the DIRAC CE
      The logic is as follows:
      - If list match is enabled and there is no queue available,
        it returns error
      - It creates a temp directory and moves into it
      - For every CE it prepares a PilotScript with the options of that CE
        and submits pilots until pilotsToSubmit is reached or the CE
        reports no more available slots
      - The temp directory is removed and the original working directory
        restored whatever the outcome

      The pilotOptions list received from the caller is not modified.
      Returns S_OK( number of submitted pilots ) or S_ERROR.
    """

    taskQueueID = taskQueueDict['TaskQueueID']

    submittedPilots = 0

    pilotRequirements = [ ( 'CPUTime', taskQueueDict['CPUTime'] ) ]
    # do we need to care about anything else?
    pilotRequirementsString = str( pilotRequirements )

    # Check that there are available queues for the Jobs.
    # The check only applies when list match is enabled.
    if self.enableListMatch:
      availableQueues = self.listMatchCache.get( pilotRequirementsString )
      # a value equal to False is handled as "not in the cache"
      if availableQueues == False:
        availableQueues = self._listQueues( pilotRequirements )
        if availableQueues != False:
          self.listMatchCache.add( pilotRequirementsString, self.listMatchDelay, availableQueues )
          self.log.verbose( 'Available Queues for TaskQueue ',  "%s: %s" % ( taskQueueID, str(availableQueues) ) )

      if not availableQueues:
        return S_ERROR( ERROR_CE + ' TQ: %s' % taskQueueID )

    baseDir = os.getcwd()
    workingDirectory = tempfile.mkdtemp( prefix= 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    try:
      os.chdir( workingDirectory )

      # options common to all CEs, starting with the Site Name
      commonOptions = list( pilotOptions )
      commonOptions.append( "-n '%s'" % self.siteName )

      # submit pilots for every CE available
      for ceName, ceConfigDict in self.computingElementDict.items():
        computingElement = ceConfigDict['CE']

        # each CE starts from the common options, the options of
        # one CE must not leak into the pilots of the next one
        ceOptions = list( commonOptions )

        # add possible requirements from Site and CE
        for req, val in getResourceDict( ceName ).items():
          ceOptions.append( "-o '/AgentJobRequirements/%s=%s'" % ( req, val ) )

        if 'ClientPlatform' in ceConfigDict:
          ceOptions.append( "-p '%s'" % ceConfigDict['ClientPlatform'] )

        if 'SharedArea' in ceConfigDict:
          ceOptions.append( "-o '/LocalSite/SharedArea=%s'" % ceConfigDict['SharedArea'] )

        if 'CPUScalingFactor' in ceConfigDict:
          ceOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % ceConfigDict['CPUScalingFactor'] )

        if 'CPUNormalizationFactor' in ceConfigDict:
          ceOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % ceConfigDict['CPUNormalizationFactor'] )

        self.log.info( "pilotOptions: ", ' '.join( ceOptions ) )

        httpProxy = ''
        if 'HttpProxy' in ceConfigDict:
          httpProxy = ceConfigDict['HttpProxy']

        pilotExecDir = ''
        if 'JobExecDir' in ceConfigDict:
          pilotExecDir = ceConfigDict['JobExecDir']

        try:
          pilotScript = self._writePilotScript( workingDirectory, ceOptions, proxy, httpProxy, pilotExecDir )
        except Exception:
          self.log.exception( ERROR_SCRIPT )
          return S_ERROR( ERROR_SCRIPT )

        self.log.info("Pilots to submit: ", pilotsToSubmit)
        while submittedPilots < pilotsToSubmit:
          # Find out how many pilots can be submitted
          ret = computingElement.available( )
          if not ret['OK']:
            self.log.error('Can not determine if pilot should be submitted: ', ret['Message'])
            break
          maxPilotsToSubmit = ret['Value']
          self.log.info("Submit Pilots: ", maxPilotsToSubmit)
          if not maxPilotsToSubmit:
            break
          # submit the pilots and then check again
          for _ in range( min(maxPilotsToSubmit,pilotsToSubmit-submittedPilots) ):
            submission = computingElement.submitJob(pilotScript, '', '')
            if not submission['OK']:
              self.log.error('Pilot submission failed: ', submission['Message'])
              return S_ERROR('Pilot submission failed after ' + str(submittedPilots) + ' pilots submitted successful')
            submittedPilots += 1
            # let the batch system some time to digest the submitted job
            time.sleep(1)

        #next CE
    finally:
      # best effort cleanup, it must never hide the result of the submission
      try:
        os.chdir( baseDir )
        shutil.rmtree( workingDirectory )
      except Exception:
        self.log.verbose( 'Could not clean up working Directory:', workingDirectory )

    return S_OK(submittedPilots)

  def _listQueues( self, pilotRequirements ):
    """
     Ask self.computingElement for what is available for the given
     pilot requirements.
     Currently only CPU time is considered

     Returns the value reported by the CE, or False on error.
    """
    result = self.computingElement.available( pilotRequirements )
    if not result['OK']:
      self.log.error( 'Can not determine available queues', result['Message'] )
      return False
    return result['Value']


  def __readFile( self, fileName ):
    """
     Return the binary content of fileName, closing the file in any case
    """
    fileObject = open( fileName, "rb" )
    try:
      return fileObject.read()
    finally:
      fileObject.close()

  def _writePilotScript( self, workingDirectory, pilotOptions, proxy, httpProxy, pilotExecDir ):
    """
     Prepare the script to execute the pilot
     For the moment it will do like Grid Pilots, a full DIRAC installation

     It assumes that the pilot script will have access to the submit working directory

     Returns the path of the wrapper script written in workingDirectory.
     Any failure is raised as an exception, which is what the caller
     handles; an error structure is never returned in place of the path.
    """
    try:
      compressedAndEncodedProxy = base64.encodestring( bz2.compress( proxy.dumpAllToString()['Value'] ) ).replace('\n','')
      compressedAndEncodedPilot = base64.encodestring( bz2.compress( self.__readFile( self.pilot ), 9 ) ).replace('\n','')
      compressedAndEncodedInstall = base64.encodestring( bz2.compress( self.__readFile( self.install ), 9 ) ).replace('\n','')
    except Exception:
      self.log.exception('Exception during file compression of proxy, dirac-pilot or dirac-install')
      raise

    localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None 
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  os.chdir( pilotWorkingDirectory )
  open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedProxy)s" ) ) )
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedPilot)s" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedInstall)s" ) ) )
  os.chmod("proxy",0600)
  os.chmod("%(pilotScript)s",0700)
  os.chmod("%(installScript)s",0700)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % { 'compressedAndEncodedProxy': compressedAndEncodedProxy, 
        'compressedAndEncodedPilot': compressedAndEncodedPilot, 
        'compressedAndEncodedInstall': compressedAndEncodedInstall, 
        'httpProxy': httpProxy, 
        'pilotScript': os.path.basename(self.pilot), 
        'installScript': os.path.basename(self.install),
        'pilotOptions': ' '.join( pilotOptions ),
        'pilotExecDir': pilotExecDir }

    fd, name = tempfile.mkstemp( suffix = '_pilotwrapper.py', prefix = 'DIRAC_', dir=workingDirectory)
    pilotWrapper = os.fdopen(fd, 'w')
    try:
      pilotWrapper.write( localPilot )
    finally:
      pilotWrapper.close()

    return name

  def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """
    Download a limited pilot proxy with VOMS extensions depending on the group

    A plain limited proxy is downloaded when no VOMS attribute is
    defined for ownerGroup.
    """
    #Assign VOMS attribute
    vomsAttr = CS.getVOMSAttributeForGroup( ownerGroup )
    if not vomsAttr:
      self.log.info( "Downloading a proxy without VOMS extensions" )
      return gProxyManager.downloadProxy( ownerDN, ownerGroup, limited = True,
                                          requiredTimeLeft = requiredTimeLeft )

    self.log.info( "Downloading a proxy with '%s' VOMS extension" % vomsAttr )
    return gProxyManager.downloadVOMSProxy( ownerDN,
                                            ownerGroup,
                                            limited = True,
                                            requiredTimeLeft = requiredTimeLeft,
                                            requiredVOMSAttribute = vomsAttr )
Example #14
0
class GridPilotDirector( PilotDirector ):
  """
    Base Grid PilotDirector class
    Derived classes must declare:
      self.Middleware: It must correspond to the string before "PilotDirector".
        (For proper naming of the logger)
      self.ResourceBrokers: list of Brokers used by the Director.
        (For proper error reporting)
  """
  def __init__( self, submitPool ):
    """
     Create the private WMS caches, seed the Grid specific settings with the
     module level defaults and only then run the PilotDirector constructor.
    """
    # Bookkeeping caches private to this class (names are mangled)
    self.__failingWMSCache = DictCache()
    self.__ticketsWMSCache = DictCache()
    self.__listMatchWMSCache = DictCache()

    # Grid specific settings, reloaded later on by configureFromSection
    self.gridEnv = GRIDENV
    self.cpuPowerRef = CPU_POWER_REF
    self.requirements = REQUIREMENTS
    self.rank = RANK
    self.fuzzyRank = FUZZY_RANK

    PilotDirector.__init__( self, submitPool )

  def configure( self, csSection, submitPool ):
    """
     Configuration shared by all Grid PilotDirectors.

     Runs the PilotDirector configuration and reloads the section options,
     purges the expired entries of the failing and tickets WMS caches, takes
     out of self.resourceBrokers every broker that is still a key of the
     failing cache, passes the remaining brokers through List.randomize and
     finally logs the GridEnv and the broker list when they are not empty.
    """
    PilotDirector.configure( self, csSection, submitPool )
    self.reloadConfiguration( csSection, submitPool )

    for wmsCache in ( self.__failingWMSCache, self.__ticketsWMSCache ):
      wmsCache.purgeExpired()

    for failingBroker in self.__failingWMSCache.getKeys():
      if failingBroker not in self.resourceBrokers:
        continue
      try:
        self.resourceBrokers.remove( failingBroker )
      except:
        # Best effort: a broker that can not be removed is simply kept
        pass

    self.resourceBrokers = List.randomize( self.resourceBrokers )

    if self.gridEnv:
      self.log.info( ' GridEnv:        ', self.gridEnv )
    if self.resourceBrokers:
      self.log.info( ' ResourceBrokers:', ', '.join( self.resourceBrokers ) )

  def configureFromSection( self, mySection ):
    """
      Reload the Grid specific options from the CS section mySection.

      Every option keeps its current value when the section does not define
      it. GridEnv has an extra fallback: when it is still empty, the GridEnv
      of the WorkloadManagement instance of the current Setup is used.
    """
    PilotDirector.configureFromSection( self, mySection )

    self.gridEnv = gConfig.getValue( mySection + '/GridEnv', self.gridEnv )
    if not self.gridEnv:
      # No specific option found, try the general one of the running Setup
      setupName = gConfig.getValue( '/DIRAC/Setup', '' )
      wmsInstance = ''
      if setupName:
        wmsInstance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % setupName, '' )
      if wmsInstance:
        self.gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % wmsInstance, '' )

    # ( attribute, CS option ) pairs, reloaded in this order
    reloadedOptions = ( ( 'resourceBrokers', 'ResourceBrokers' ),
                        ( 'cpuPowerRef', 'CPUPowerRef' ),
                        ( 'requirements', 'Requirements' ),
                        ( 'rank', 'Rank' ),
                        ( 'fuzzyRank', 'FuzzyRank' ) )
    for attribute, option in reloadedOptions:
      currentValue = getattr( self, attribute )
      setattr( self, attribute, gConfig.getValue( mySection + '/' + option, currentValue ) )

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method does the actual pilot submission to the Grid RB.
      The logic is as follows:
      - If there are no available RBs it returns an error
      - If there is no VOMS extension in the proxy it returns an error
      - It creates a temporary working directory under workDir
      - It prepares a JDL through self._prepareJDL, which must return a dict
        with the 'JDL', 'Requirements' and 'RB' keys
      - If List Match is enabled it checks, through a cache, that there are
        CEs matching the requirements
      - It submits through self._submitPilot and registers every returned
        reference in pilotAgentsDB
      - If pilotsToSubmit is larger than pilotsPerJob it requests a new proxy
        token, replaces the token option in pilotOptions and calls itself
        again for the remaining pilots

      The working directory is removed in all cases, also when one of the
      steps using it raises.

      Returns S_OK( number of submitted pilots ) or S_ERROR. When a nested
      submission fails, its S_ERROR is returned with 'Value' set to the number
      of pilots submitted by this call.
      NOTE: pilotOptions is modified in place when a new token is requested.
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    # The pilots are registered with the identity of the proxy,
    # not with taskQueueDict['OwnerDN']
    ownerDN = proxy.getCredentials()['Value']['identity']

    if not self.resourceBrokers:
      # Since we can exclude RBs from the list, it may become empty
      return S_ERROR( ERROR_RB )

    # Need to get VOMS extension for the later interactions with WMS
    ret = gProxyManager.getVOMSAttributes( proxy )
    if not ret['OK']:
      self.log.error( ERROR_VOMS, ret['Message'] )
      return S_ERROR( ERROR_VOMS )
    if not ret['Value']:
      return S_ERROR( ERROR_VOMS )
    vomsGroup = ret['Value'][0]

    workingDirectory = tempfile.mkdtemp( prefix = 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    try:
      # Write JDL
      retDict = self._prepareJDL( taskQueueDict, workingDirectory, pilotOptions, pilotsPerJob,
                                  ceMask, submitPrivatePilot, privateTQ )
      jdl = retDict['JDL']
      pilotRequirements = retDict['Requirements']
      rb = retDict['RB']
      if not jdl:
        return S_ERROR( ERROR_JDL )

      # Check that there are available queues for the Job:
      if self.enableListMatch:
        now = Time.dateTime()
        availableCEs = self.listMatchCache.get( pilotRequirements )
        # Explicit comparison with False: an empty list is a valid answer
        # ("nothing matches") and must not trigger a new List Match
        if availableCEs == False:
          availableCEs = self._listMatch( proxy, jdl, taskQueueID, rb )
          if availableCEs != False:
            self.log.verbose( 'LastListMatch', now )
            self.log.verbose( 'AvailableCEs ', availableCEs )
            # listMatchDelay is given in minutes
            self.listMatchCache.add( pilotRequirements, self.listMatchDelay * 60,
                                     value = availableCEs )
        if not availableCEs:
          return S_ERROR( ERROR_CE + ' TQ: %d' % taskQueueID )

      # Now we are ready for the actual submission
      self.log.verbose( 'Submitting Pilots for TaskQueue', taskQueueID )
      submitRet = self._submitPilot( proxy, pilotsPerJob, jdl, taskQueueID, rb )
    finally:
      # Single cleanup point, reached on success, on early return and on error
      shutil.rmtree( workingDirectory, ignore_errors = True )

    if not submitRet:
      return S_ERROR( 'Pilot Submission Failed for TQ %d ' % taskQueueID )

    submittedPilots = 0

    if pilotsPerJob != 1 and len( submitRet ) != pilotsPerJob:
      # Parametric jobs are used: register the children of each reference
      for pilotReference, resourceBroker in submitRet:
        pilotReference = self._getChildrenReferences( proxy, pilotReference, taskQueueID )
        submittedPilots += len( pilotReference )
        pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN,
                      vomsGroup, resourceBroker, self.gridMiddleware,
                      pilotRequirements )
    else:
      for pilotReference, resourceBroker in submitRet:
        pilotReference = [pilotReference]
        submittedPilots += len( pilotReference )
        pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN,
                      vomsGroup, broker = resourceBroker, gridType = self.gridMiddleware,
                      requirements = pilotRequirements )

    # add some sleep here
    time.sleep( 0.1 * submittedPilots )

    if pilotsToSubmit > pilotsPerJob:
      # Additional submissions are necessary, need to get a new token and iterate.
      pilotsToSubmit -= pilotsPerJob
      # NOTE(review): genericPilotDN / genericPilotGroup are not set anywhere
      # in the code visible here - confirm they are defined by the subclass.
      ownerDN = self.genericPilotDN
      ownerGroup = self.genericPilotGroup
      result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) )
      if not result[ 'OK' ]:
        self.log.error( ERROR_TOKEN, result['Message'] )
        return S_ERROR( ERROR_TOKEN )
      ( token, numberOfUses ) = result[ 'Value' ]
      # Drop every previous token option. This is done with a slice assignment
      # rather than remove() inside a loop over the same list, which skips the
      # element following each removed one; the caller's list is still updated.
      tokenOption = '-o /Security/ProxyToken='
      pilotOptions[:] = [ option for option in pilotOptions
                          if not option.startswith( tokenOption ) ]
      pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
      # NOTE(review): this can yield 0 when numberOfUses < maxJobsInFillMode,
      # in which case pilotsToSubmit no longer decreases - confirm upstream
      # guarantees enough uses per token.
      pilotsPerJob = min( pilotsPerJob, int( numberOfUses / self.maxJobsInFillMode ) )
      result = self._submitPilots( workDir, taskQueueDict, pilotOptions,
                                   pilotsToSubmit, ceMask,
                                   submitPrivatePilot, privateTQ,
                                   proxy, pilotsPerJob )
      if not result['OK']:
        result['Value'] = submittedPilots
        return result
      submittedPilots += result['Value']

    return S_OK( submittedPilots )

  def _prepareJDL( self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ ):
    """
      Placeholder that every middleware specific subclass has to override.

      Reaching this implementation is logged as an error and the process is
      terminated through sys.exit().
    """
    message = '_prepareJDL() method should be implemented in a subclass'
    self.log.error( message )
    sys.exit()

  def _JobJDL( self, taskQueueDict, pilotOptions, ceMask ):
    """
     The Job JDL is the same for LCG and GLite
    """
    pilotJDL = 'Executable     = "%s";\n' % os.path.basename( self.pilot )
    executable = self.pilot

    pilotJDL += 'Arguments     = "%s";\n' % ' '.join( pilotOptions )

    pilotJDL += 'CPUTimeRef    = %s;\n' % taskQueueDict['CPUTime']

    pilotJDL += 'CPUPowerRef   = %s;\n' % self.cpuPowerRef

    pilotJDL += """CPUWorkRef    = real( CPUTimeRef * CPUPowerRef );

Lookup        = "CPUScalingReferenceSI00=*";
cap = isList( other.GlueCECapability ) ? other.GlueCECapability : { "dummy" };
i0 = regexp( Lookup, cap[0] ) ? 0 : undefined;
i1 = isString( cap[1] ) && regexp( Lookup, cap[1] ) ? 1 : i0;
i2 = isString( cap[2] ) && regexp( Lookup, cap[2] ) ? 2 : i1;
i3 = isString( cap[3] ) && regexp( Lookup, cap[3] ) ? 3 : i2;
i4 = isString( cap[4] ) && regexp( Lookup, cap[4] ) ? 4 : i3;
i5 = isString( cap[5] ) && regexp( Lookup, cap[5] ) ? 5 : i4;
index = isString( cap[6] ) && regexp( Lookup, cap[6] ) ? 6 : i5;
i = isUndefined( index ) ? 0 : index;

QueuePowerRef = real( ! isUndefined( index ) ? int( substr( cap[i], size( Lookup ) - 1 ) ) : other.GlueHostBenchmarkSI00 );
QueueTimeRef  = real( other.GlueCEPolicyMaxCPUTime * 60 );
QueueWorkRef  = QueuePowerRef * QueueTimeRef;
"""

    requirements = list( self.requirements )
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      # if there an explicit Grig CE requested by the TQ, remove the Ranking requirement
      for req in self.requirements:
        if req.strip().lower()[:6] == 'rank >':
          requirements.remove( req )

    requirements.append( 'QueueWorkRef > CPUWorkRef' )

    siteRequirements = '\n || '.join( [ 'other.GlueCEInfoHostName == "%s"' % s for s in ceMask ] )
    requirements.append( "( %s\n )" % siteRequirements )

    pilotRequirements = '\n && '.join( requirements )

    pilotJDL += 'pilotRequirements  = %s;\n' % pilotRequirements

    pilotJDL += 'Rank          = %s;\n' % self.rank
    pilotJDL += 'FuzzyRank     = %s;\n' % self.fuzzyRank
    pilotJDL += 'StdOutput     = "%s";\n' % outputSandboxFiles[0]
    pilotJDL += 'StdError      = "%s";\n' % outputSandboxFiles[1]

    pilotJDL += 'InputSandbox  = { "%s" };\n' % '", "'.join( [ self.install, executable ] )

    pilotJDL += 'OutputSandbox = { %s };\n' % ', '.join( [ '"%s"' % f for f in outputSandboxFiles ] )

    self.log.verbose( pilotJDL )

    return ( pilotJDL, pilotRequirements )


  def parseListMatchStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Run the List Match command and extract the matched CEs from its stdout.

      cmd is executed through executeGridCommand with the given proxy and the
      configured GridEnv. When the execution fails, or the command exits with
      a non zero status, the error is logged, self.__sendErrorMail is called
      for rb and False is returned.

      Otherwise the list (possibly empty) of stdout lines containing
      '/jobmanager-' or '/cream-' is returned.
    """
    self.log.verbose( 'Executing List Match for TaskQueue', taskQueueID )

    startTime = time.time()
    result = executeGridCommand( proxy, cmd, self.gridEnv )

    if not result['OK']:
      self.log.error( 'Failed to execute List Match:', result['Message'] )
      self.__sendErrorMail( rb, 'List Match', cmd, result, proxy )
      return False

    output = result['Value']
    if output[0] != 0:
      self.log.error( 'Error executing List Match:', str( output[0] ) + '\n'.join( output[1:3] ) )
      self.__sendErrorMail( rb, 'List Match', cmd, result, proxy )
      return False

    elapsed = time.time() - startTime
    self.log.info( 'List Match Execution Time: %.2f for TaskQueue %d' % ( elapsed, taskQueueID ) )

    stdout, stderr = output[1], output[2]

    # Keep the lines naming a queue
    # TODO: the line has to be stripped from extra info
    availableCEs = [ line for line in List.fromChar( stdout, '\n' )
                     if re.search( '/jobmanager-|/cream-', line ) ]

    if availableCEs:
      self.log.debug( 'List-Match returns:', str( output[0] ) + '\n'.join( output[1:3] ) )
      self.log.info( 'List-Match found %s CEs for TaskQueue' % len( availableCEs ), taskQueueID )
      self.log.verbose( ', '.join( availableCEs ) )
    else:
      self.log.info( 'List-Match failed to find CEs for TaskQueue', taskQueueID )
      self.log.info( stdout )
      self.log.info( stderr )

    return availableCEs

  def parseJobSubmitStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Run the Job Submit command and extract the pilot reference from stdout.

      cmd is executed through executeGridCommand with the given proxy and the
      configured GridEnv. When the execution fails, or the command exits with
      a non zero status, the error is logged, self.__sendErrorMail is called
      for rb and False is returned. False is also returned when stdout does
      not contain any "https:" reference.

      On success returns the tuple ( reference, broker ) where:
       - reference is the last "https:..." token found in stdout
       - broker is the host part of the first reference of the form
         "https://host:port..."; it is an empty string when no reference has
         that form (this used to raise an AttributeError)
    """
    start = time.time()
    self.log.verbose( 'Executing Job Submit for TaskQueue', taskQueueID )

    ret = executeGridCommand( proxy, cmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute Job Submit:', ret['Message'] )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing Job Submit:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    self.log.info( 'Job Submit Execution Time: %.2f for TaskQueue %d' % ( ( time.time() - start ), taskQueueID ) )

    stdout = ret['Value'][1]

    pilotReference = None
    # The returned broker is derived from the reference itself, the rb
    # argument is only used for the error mails above
    brokerHost = ''
    for line in List.fromChar( stdout, '\n' ):
      referenceMatch = re.search( r"(https:\S+)", line )
      if not referenceMatch:
        continue
      pilotReference = referenceMatch.group( 1 )
      if not brokerHost:
        hostMatch = re.search( r"https://(.+):.+", pilotReference )
        # re.search returns None for a reference without ":port" part
        if hostMatch:
          brokerHost = hostMatch.group( 1 )

    if pilotReference is None:
      self.log.error( 'Job Submit returns no Reference:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      return False

    self.log.info( 'Reference %s for TaskQueue %s' % ( pilotReference, taskQueueID ) )

    return pilotReference, brokerHost

  def _writeJDL( self, filename, jdlList ):
    """
      Write the JDL statements of jdlList, joined with new lines, to filename.

      Returns filename on success. On any error the exception is logged and
      an empty string is returned. The file is closed in both cases.
    """
    try:
      jdlFile = open( filename, 'w' )
      try:
        jdlFile.write( '\n'.join( jdlList ) )
      finally:
        # Do not leak the descriptor when the write (or the join) fails
        jdlFile.close()
    except Exception:
      self.log.exception()
      return ''

    return filename
Example #15
0
class PilotDirector:
    """
    Base Pilot Director class.
    Derived classes must implement:
      * __init__( self, submitPool ):
          that must call the parent class __init__ method and then do its own initialization
      * configure( self, csSection, submitPool ):
          that must call the parent class configure method and then do its own configuration
      * _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                      submitPrivatePilot, privateTQ, proxy, pilotsPerJob )
          actual method doing the submission to the backend once the submitPilots method
          has prepared the common part

    Derived classes might implement:
      * configureFromSection( self, mySection ):
          to reload from a CS section the additional datamembers they might have defined.

    If additional datamembers are defined, they must:
      - be declared in the __init__
      - be reconfigured in the configureFromSection method by executing
        self.reloadConfiguration( csSection, submitPool ) in their configure method
  """
    gridMiddleware = ''

    def __init__(self, submitPool):
        """
        Create the logger and load the module level defaults.

        The logger is named '<gridMiddleware>PilotDirector'; the submit pool
        is appended to that name when it differs from the grid middleware.
        """
        loggerName = '%sPilotDirector' % self.gridMiddleware
        if submitPool != self.gridMiddleware:
            loggerName = '%sPilotDirector/%s' % (self.gridMiddleware,
                                                 submitPool)
        self.log = gLogger.getSubLogger(loggerName)

        # Pilot and installation scripts
        self.pilot = DIRAC_PILOT
        self.install = DIRAC_INSTALL

        # Options handed over to the pilots
        self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
        self.extraPilotOptions = []

        # What the pilots have to install
        self.installVersion = DIRAC_VERSION
        self.installProject = DIRAC_PROJECT
        self.installation = DIRAC_INSTALLATION

        self.virtualOrganization = VIRTUAL_ORGANIZATION
        self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
        self.targetGrids = [self.gridMiddleware]

        # List Match settings and the cache of its results
        self.enableListMatch = ENABLE_LISTMATCH
        self.listMatchDelay = LISTMATCH_DELAY
        self.listMatchCache = DictCache()

        self.privatePilotFraction = PRIVATE_PILOT_FRACTION

        # Error handling and notification settings
        self.errorClearTime = ERROR_CLEAR_TIME
        self.errorTicketTime = ERROR_TICKET_TIME
        self.errorMailAddress = DIRAC.errorMail
        self.alarmMailAddress = DIRAC.alarmMail
        self.mailFromAddress = FROM_MAIL

        if 'log' not in self.__dict__:
            self.log = gLogger.getSubLogger('PilotDirector')
        self.log.info('Initialized')

    def configure(self, csSection, submitPool):
        """
        Common configuration step for all PilotDirectors.

        Reloads the options from csSection and from its middleware and submit
        pool sub-sections, takes the pilot Version / Project / Installation
        from the Operations helper (current values are the defaults), logs the
        resulting configuration and purges the expired List Match cache
        entries.
        """
        self.configureFromSection(csSection)
        self.reloadConfiguration(csSection, submitPool)

        # Get the defaults for the Setup where the Director is running.
        # Version is read as a list and its first element is kept.
        operations = Operations()
        self.installVersion = operations.getValue(
            cfgPath('Pilot', 'Version'), [self.installVersion])[0]
        self.installProject = operations.getValue(
            cfgPath('Pilot', 'Project'), self.installProject)
        self.installation = operations.getValue(
            cfgPath('Pilot', 'Installation'), self.installation)

        info = self.log.info
        info('===============================================')
        info('Configuration:')
        info('')
        info(' Target Grids:   ', ', '.join(self.targetGrids))
        info(' Install script: ', self.install)
        info(' Pilot script:   ', self.pilot)
        info(' Install Ver:    ', self.installVersion)
        # Optional settings are only reported when they are set
        if self.installProject:
            info(' Project:        ', self.installProject)
        if self.installation:
            info(' Installation:   ', self.installation)
        if self.extraPilotOptions:
            info(' Extra Options:   ', ' '.join(self.extraPilotOptions))
        info(' ListMatch:      ', self.enableListMatch)
        info(' Private %:      ', self.privatePilotFraction * 100)
        if self.enableListMatch:
            info(' ListMatch Delay:', self.listMatchDelay)

        self.listMatchCache.purgeExpired()

    def reloadConfiguration(self, csSection, submitPool):
        """
     Common Configuration can be overwriten for each GridMiddleware
    """
        mySection = csSection + '/' + self.gridMiddleware
        self.configureFromSection(mySection)

        # And Again for each SubmitPool
        mySection = csSection + '/' + submitPool
        self.configureFromSection(mySection)

    def configureFromSection(self, mySection):
        """
        Reload the data members from the CS section mySection.

        Each option keeps its current value when the section does not define
        it. The virtual organization is resolved from the section first, then
        from getVOForGroup and finally from the current value.
        """
        # (attribute, CS option) pairs, reloaded in this order. installVersion
        # appears twice on purpose: 'Version' is applied on top of the legacy
        # 'DIRACVersion' option.
        # TODO: Remove this DIRACVersion after 06/2012
        reloadedOptions = (
            ('pilot', 'PilotScript'),
            ('installVersion', 'DIRACVersion'),
            ('installVersion', 'Version'),
            ('extraPilotOptions', 'ExtraPilotOptions'),
            ('install', 'InstallScript'),
            ('installProject', 'Project'),
            ('installation', 'Installation'),
            ('maxJobsInFillMode', 'MaxJobsInFillMode'),
            ('targetGrids', 'TargetGrids'),
            ('enableListMatch', 'EnableListMatch'),
            ('listMatchDelay', 'ListMatchDelay'),
            ('errorClearTime', 'ErrorClearTime'),
            ('errorTicketTime', 'ErrorTicketTime'),
            ('errorMailAddress', 'ErrorMailAddress'),
            ('alarmMailAddress', 'AlarmMailAddress'),
            ('mailFromAddress', 'MailFromAddress'),
            ('privatePilotFraction', 'PrivatePilotFraction'),
        )
        for attribute, option in reloadedOptions:
            currentValue = getattr(self, attribute)
            setattr(self, attribute,
                    gConfig.getValue(mySection + '/' + option, currentValue))

        # First non empty candidate wins, the current value being the last one
        self.virtualOrganization = (
            gConfig.getValue(mySection + '/VirtualOrganization', '')
            or getVOForGroup('NonExistingGroup')
            or self.virtualOrganization)

    def _resolveCECandidates(self, taskQueueDict):
        """
      Return a list of CEs for this TaskQueue
    """
        # assume user knows what they're doing and avoid site mask e.g. sam jobs
        if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
            self.log.info(
                'CEs requested by TaskQueue %s:' %
                taskQueueDict['TaskQueueID'],
                ', '.join(taskQueueDict['GridCEs']))
            return taskQueueDict['GridCEs']

        # Get the mask
        ret = jobDB.getSiteMask()
        if not ret['OK']:
            self.log.error('Can not retrieve site Mask from DB:',
                           ret['Message'])
            return []

        siteMask = ret['Value']
        if not siteMask:
            self.log.error('Site mask is empty')
            return []

        self.log.verbose('Site Mask: %s' % ', '.join(siteMask))

        # remove banned sites from siteMask
        if 'BannedSites' in taskQueueDict:
            for site in taskQueueDict['BannedSites']:
                if site in siteMask:
                    siteMask.remove(site)
                    self.log.verbose('Removing banned site %s from site Mask' %
                                     site)

        # remove from the mask if a Site is given
        siteMask = [
            site for site in siteMask
            if 'Sites' not in taskQueueDict or site in taskQueueDict['Sites']
        ]

        if not siteMask:
            # pilot can not be submitted
            self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' %
                          taskQueueDict['TaskQueueID'])
            return []

        self.log.info(
            'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
            ', '.join(siteMask))

        # Get CE's associates to the given site Names
        ceMask = []

        for grid in self.targetGrids:

            section = '/Resources/Sites/%s' % grid
            ret = gConfig.getSections(section)
            if not ret['OK']:
                # this is hack, maintained until LCG is added as TargetGrid for the gLite SubmitPool
                section = '/Resources/Sites/LCG'
                ret = gConfig.getSections(section)

            if not ret['OK']:
                self.log.error('Could not obtain CEs from CS', ret['Message'])
                continue

            gridSites = ret['Value']
            for siteName in gridSites:
                if siteName in siteMask:
                    ret = gConfig.getValue('%s/%s/CE' % (section, siteName),
                                           [])
                    for ce in ret:
                        submissionMode = gConfig.getValue(
                            '%s/%s/CEs/%s/SubmissionMode' %
                            (section, siteName, ce), 'gLite')
                        if submissionMode == self.gridMiddleware and ce not in ceMask:
                            ceMask.append(ce)

        if not ceMask:
            self.log.info(
                'No CE Candidate found for TaskQueue %s:' %
                taskQueueDict['TaskQueueID'], ', '.join(siteMask))

        self.log.verbose(
            'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
            ', '.join(ceMask))

        return ceMask

    def _getPilotOptions(self, taskQueueDict, pilotsToSubmit):
        """
        Build the command line options of the pilots for a TaskQueue.

        Decides whether private or generic pilots are submitted:
          - private pilots carry the owner group (and the owner DN unless the
            group has the 'JobSharing' property)
          - generic pilots use the generic pilot credentials and carry a proxy
            token requested from the proxy manager

        Returns S_OK with the tuple
          (pilotOptions, pilotsToSubmit, ownerDN, ownerGroup,
           submitPrivatePilot, privateTQ)
        where pilotsToSubmit is the number of pilots to submit at once, or
        S_ERROR when the generic credentials or the token can not be obtained.
        """
        # Need to limit the maximum number of pilots to submit at once
        # For generic pilots this is limited by the number of use of the tokens and the
        # maximum number of jobs in Filling mode, but for private Jobs we need an extra limitation:
        pilotsToSubmit = max(
            min(pilotsToSubmit, int(50 / self.maxJobsInFillMode)), 1)
        pilotOptions = []
        # A fraction of the pilots of generic TaskQueues is submitted as private
        privateIfGenericTQ = self.privatePilotFraction > random.random()
        privateTQ = ('PilotTypes' in taskQueueDict and 'private'
                     in [t.lower() for t in taskQueueDict['PilotTypes']])
        forceGeneric = 'ForceGeneric' in taskQueueDict
        submitPrivatePilot = (privateIfGenericTQ
                              or privateTQ) and not forceGeneric
        if submitPrivatePilot:
            self.log.verbose('Submitting private pilots for TaskQueue %s' %
                             taskQueueDict['TaskQueueID'])
            ownerDN = taskQueueDict['OwnerDN']
            ownerGroup = taskQueueDict['OwnerGroup']
            # User Group requirement
            pilotOptions.append('-G %s' % taskQueueDict['OwnerGroup'])
            # check if group allows jobsharing
            ownerGroupProperties = getPropertiesForGroup(ownerGroup)
            if 'JobSharing' not in ownerGroupProperties:
                # Add Owner requirement to pilot
                pilotOptions.append("-O '%s'" % ownerDN)
            if privateTQ:
                pilotOptions.append(
                    '-o /Resources/Computing/CEDefaults/PilotType=private')
            maxJobsInFillMode = self.maxJobsInFillMode
        else:
            # For generic jobs we'll submit mixture of generic and private pilots
            self.log.verbose('Submitting generic pilots for TaskQueue %s' %
                             taskQueueDict['TaskQueueID'])
            # Find the generic credentials for the owner group
            result = findGenericPilotCredentials(
                group=taskQueueDict['OwnerGroup'])
            if not result['OK']:
                self.log.error(ERROR_GENERIC_CREDENTIALS, result['Message'])
                return S_ERROR(ERROR_GENERIC_CREDENTIALS)
            ownerDN, ownerGroup = result['Value']

            result = gProxyManager.requestToken(
                ownerDN, ownerGroup, max(pilotsToSubmit,
                                         self.maxJobsInFillMode))
            if not result['OK']:
                self.log.error(ERROR_TOKEN, result['Message'])
                return S_ERROR(ERROR_TOKEN)
            (token, numberOfUses) = result['Value']
            pilotsToSubmit = min(numberOfUses, pilotsToSubmit)

            pilotOptions.append('-o /Security/ProxyToken=%s' % token)

            # Ceiling of pilotsToSubmit / maxJobsInFillMode. Floor division
            # keeps the count an integer also under true division, where a
            # plain "/" would produce a fractional number of pilots.
            pilotsToSubmit = max(
                1, (pilotsToSubmit - 1) // self.maxJobsInFillMode + 1)

            maxJobsInFillMode = int(numberOfUses / pilotsToSubmit)
        # Use Filling mode
        pilotOptions.append('-M %s' % maxJobsInFillMode)

        # Debug
        pilotOptions.append('-d')
        # Setup.
        pilotOptions.append('-S %s' % taskQueueDict['Setup'])
        # CS Servers
        csServers = gConfig.getServersList()
        if len(csServers) > 3:
            # Remove the master
            master = gConfigurationData.getMasterServer()
            if master in csServers:
                csServers.remove(master)
        pilotOptions.append('-C %s' % ",".join(csServers))
        # DIRAC Extensions
        extensionsList = getCSExtensions()
        if extensionsList:
            pilotOptions.append('-e %s' % ",".join(extensionsList))
        # Get DIRAC version and project. There might be global Setup defaults
        # and per VO/Setup defaults (from configure)
        opsHelper = Operations(group=taskQueueDict['OwnerGroup'],
                               setup=taskQueueDict['Setup'])
        # Requested version of DIRAC (it can be a list, so we take the first one)
        version = opsHelper.getValue(cfgPath('Pilot', 'Version'),
                                     [self.installVersion])[0]
        pilotOptions.append('-r %s' % version)
        # Requested Project to install
        installProject = opsHelper.getValue(cfgPath('Pilot', 'Project'),
                                            self.installProject)
        if installProject:
            pilotOptions.append('-l %s' % installProject)
        installation = opsHelper.getValue(cfgPath('Pilot', 'Installation'),
                                          self.installation)
        if installation:
            pilotOptions.append("-V %s" % installation)
        # Requested CPU time
        pilotOptions.append('-T %s' % taskQueueDict['CPUTime'])

        # The submit pool option is added unless the extra options carry it
        if self.submitPoolOption not in self.extraPilotOptions:
            pilotOptions.append(self.submitPoolOption)

        if self.extraPilotOptions:
            pilotOptions.extend(self.extraPilotOptions)

        return S_OK((pilotOptions, pilotsToSubmit, ownerDN, ownerGroup,
                     submitPrivatePilot, privateTQ))

    def _submitPilots(self, workDir, taskQueueDict, pilotOptions,
                      pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ,
                      proxy, pilotsPerJob):
        """
        Backend hook, to be implemented by the middleware specific director.

        This default implementation only logs that the hook is missing. A
        missing implementation is a problem of the Director and not of the
        Job, which is why S_OK is returned rather than S_ERROR.
        """
        message = '_submitPilots method not implemented'
        self.log.error(message)
        return S_OK()

    def submitPilots(self, taskQueueDict, pilotsToSubmit, workDir=None):
        """
        Submit pilots for the given TaskQueue.

        Resolves the CE candidates, the pilot options and a pilot proxy
        (requested with five days of validity to avoid renewals) and hands
        everything over to the backend specific _submitPilots, whose result
        is returned.

        S_ERROR is returned when there is no CE candidate, when the pilot
        options can not be built or when no proxy is available. Any exception
        is logged and S_OK(0) is returned.
        """
        try:
            taskQueueID = taskQueueDict['TaskQueueID']

            self.log.verbose('Submitting Pilot')

            ceCandidates = self._resolveCECandidates(taskQueueDict)
            if not ceCandidates:
                return S_ERROR('No CE available for TaskQueue %d' %
                               int(taskQueueID))

            optionsResult = self._getPilotOptions(taskQueueDict,
                                                  pilotsToSubmit)
            if not optionsResult['OK']:
                return optionsResult
            (pilotOptions, pilotsPerJob, ownerDN, ownerGroup,
             submitPrivatePilot, privateTQ) = optionsResult['Value']

            # get a valid proxy, submit with a long proxy to avoid renewal
            fiveDays = 86400 * 5
            proxyResult = self._getPilotProxyFromDIRACGroup(
                ownerDN, ownerGroup, requiredTimeLeft=fiveDays)
            if not proxyResult['OK']:
                self.log.error(proxyResult['Message'])
                self.log.error('No proxy Available',
                               'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                return S_ERROR(ERROR_PROXY)

            # Now call a Grid Specific method to handle the final submission of the pilots
            return self._submitPilots(workDir, taskQueueDict, pilotOptions,
                                      pilotsToSubmit, ceCandidates,
                                      submitPrivatePilot, privateTQ,
                                      proxyResult['Value'], pilotsPerJob)

        except Exception:
            self.log.exception('Error in Pilot Submission')

        return S_OK(0)

    def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup,
                                     requiredTimeLeft):
        """
        Download the pilot proxy of ownerDN / ownerGroup from the proxy
        manager and return its result.

        To be overwritten if a given Pilot does not require a full proxy.
        """
        owner = "%s@%s" % (ownerDN, ownerGroup)
        self.log.info("Downloading %s proxy" % owner)
        proxyResult = gProxyManager.getPilotProxyFromDIRACGroup(
            ownerDN, ownerGroup, requiredTimeLeft)
        return proxyResult

    def exceptionCallBack(self, threadedJob, exceptionInfo):
        self.log.exception('Error in Pilot Submission')
Example #16
0
class PilotDirector:
    """
    Base Pilot Director class.

    Derived classes must implement:
      * __init__( self, submitPool ):
          that must call the parent class __init__ method and then do its own initialization
      * configure( self, csSection, submitPool ):
          that must call the parent class configure method and then do its own configuration
      * _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                      submitPrivatePilot, privateTQ, proxy, pilotsPerJob )
          actual method doing the submission to the backend once the submitPilots method
          has prepared the common part

    Derived classes might implement:
      * configureFromSection( self, mySection ):
          to reload from a CS section the additional datamembers they might have defined.

    If additional datamembers are defined, they must:
      - be declared in the __init__
      - be reconfigured in the configureFromSection method by executing
        self.reloadConfiguration( csSection, submitPool ) in their configure method
    """

    # Name of the grid flavour handled by the director; used to build the
    # logger name, the default target grid list and the per-middleware CS section.
    gridMiddleware = ""

    def __init__(self, submitPool):
        """
        Define the logger and some defaults.

        :param submitPool: name of the SubmitPool this director serves; it is
                           appended to the logger name when it differs from
                           the grid middleware name.
        """

        if submitPool == self.gridMiddleware:
            self.log = gLogger.getSubLogger("%sPilotDirector" % self.gridMiddleware)
        else:
            self.log = gLogger.getSubLogger("%sPilotDirector/%s" % (self.gridMiddleware, submitPool))

        self.pilot = DIRAC_PILOT
        self.submitPoolOption = "-o /Resources/Computing/CEDefaults/SubmitPool=%s" % submitPool
        self.extraPilotOptions = []
        self.installVersion = DIRAC_VERSION
        self.installProject = DIRAC_PROJECT
        self.installation = DIRAC_INSTALLATION

        self.virtualOrganization = VIRTUAL_ORGANIZATION
        self.install = DIRAC_INSTALL
        self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
        self.targetGrids = [self.gridMiddleware]

        self.enableListMatch = ENABLE_LISTMATCH
        self.listMatchDelay = LISTMATCH_DELAY
        self.listMatchCache = DictCache()

        self.privatePilotFraction = PRIVATE_PILOT_FRACTION

        self.errorClearTime = ERROR_CLEAR_TIME
        self.errorTicketTime = ERROR_TICKET_TIME
        self.errorMailAddress = DIRAC.errorMail
        self.alarmMailAddress = DIRAC.alarmMail
        self.mailFromAddress = FROM_MAIL

        # Fallback logger, kept as a safety net in case no logger was
        # attached to the instance above.
        if "log" not in self.__dict__:
            self.log = gLogger.getSubLogger("PilotDirector")
        self.log.info("Initialized")

    def configure(self, csSection, submitPool):
        """
        Here goes common configuration for all PilotDirectors.

        Loads the options from `csSection`, then from its middleware and
        SubmitPool subsections (see reloadConfiguration), applies the
        Operations defaults for version / project / installation, logs the
        resulting configuration and purges expired list-match cache entries.
        """
        self.configureFromSection(csSection)
        self.reloadConfiguration(csSection, submitPool)

        # Get the defaults for the Setup where the Director is running
        opsHelper = Operations()
        # The version option is read with a list default and only its first entry is used
        self.installVersion = opsHelper.getValue(cfgPath("Pilot", "Version"), [self.installVersion])[0]
        self.installProject = opsHelper.getValue(cfgPath("Pilot", "Project"), self.installProject)
        self.installation = opsHelper.getValue(cfgPath("Pilot", "Installation"), self.installation)

        self.log.info("===============================================")
        self.log.info("Configuration:")
        self.log.info("")
        self.log.info(" Target Grids:   ", ", ".join(self.targetGrids))
        self.log.info(" Install script: ", self.install)
        self.log.info(" Pilot script:   ", self.pilot)
        self.log.info(" Install Ver:    ", self.installVersion)
        if self.installProject:
            self.log.info(" Project:        ", self.installProject)
        if self.installation:
            self.log.info(" Installation:   ", self.installation)
        if self.extraPilotOptions:
            self.log.info(" Extra Options:   ", " ".join(self.extraPilotOptions))
        self.log.info(" ListMatch:      ", self.enableListMatch)
        self.log.info(" Private %:      ", self.privatePilotFraction * 100)
        if self.enableListMatch:
            self.log.info(" ListMatch Delay:", self.listMatchDelay)
        self.listMatchCache.purgeExpired()

    def reloadConfiguration(self, csSection, submitPool):
        """
        Common Configuration can be overwritten for each GridMiddleware.

        Re-reads the options first from `<csSection>/<gridMiddleware>` and
        then from `<csSection>/<submitPool>`, so the SubmitPool section is
        applied last.
        """
        mySection = csSection + "/" + self.gridMiddleware
        self.configureFromSection(mySection)

        # And Again for each SubmitPool
        mySection = csSection + "/" + submitPool
        self.configureFromSection(mySection)

    def configureFromSection(self, mySection):
        """
        Reload the director options from the CS section `mySection`.

        Every option defaults to the value currently held by the instance,
        so options missing from the section leave the attribute as it was
        (assuming gConfig.getValue returns its default for missing options).
        """
        self.pilot = gConfig.getValue(mySection + "/PilotScript", self.pilot)
        # TODO: Remove this DIRACVersion after 06/2012
        self.installVersion = gConfig.getValue(mySection + "/DIRACVersion", self.installVersion)
        # "Version" is read after "DIRACVersion", so it takes precedence when both are set
        self.installVersion = gConfig.getValue(mySection + "/Version", self.installVersion)
        self.extraPilotOptions = gConfig.getValue(mySection + "/ExtraPilotOptions", self.extraPilotOptions)
        self.install = gConfig.getValue(mySection + "/InstallScript", self.install)
        self.installProject = gConfig.getValue(mySection + "/Project", self.installProject)
        self.installation = gConfig.getValue(mySection + "/Installation", self.installation)
        self.maxJobsInFillMode = gConfig.getValue(mySection + "/MaxJobsInFillMode", self.maxJobsInFillMode)
        self.targetGrids = gConfig.getValue(mySection + "/TargetGrids", self.targetGrids)

        self.enableListMatch = gConfig.getValue(mySection + "/EnableListMatch", self.enableListMatch)
        self.listMatchDelay = gConfig.getValue(mySection + "/ListMatchDelay", self.listMatchDelay)
        self.errorClearTime = gConfig.getValue(mySection + "/ErrorClearTime", self.errorClearTime)
        self.errorTicketTime = gConfig.getValue(mySection + "/ErrorTicketTime", self.errorTicketTime)
        self.errorMailAddress = gConfig.getValue(mySection + "/ErrorMailAddress", self.errorMailAddress)
        self.alarmMailAddress = gConfig.getValue(mySection + "/AlarmMailAddress", self.alarmMailAddress)
        self.mailFromAddress = gConfig.getValue(mySection + "/MailFromAddress", self.mailFromAddress)
        self.privatePilotFraction = gConfig.getValue(mySection + "/PrivatePilotFraction", self.privatePilotFraction)

        virtualOrganization = gConfig.getValue(mySection + "/VirtualOrganization", "")
        if not virtualOrganization:
            # NOTE(review): presumably querying a non-existing group makes
            # getVOForGroup fall back to a default VO - confirm against its implementation.
            virtualOrganization = getVOForGroup("NonExistingGroup")
            if not virtualOrganization:
                virtualOrganization = self.virtualOrganization
        self.virtualOrganization = virtualOrganization

    def _resolveCECandidates(self, taskQueueDict):
        """
        Return a list of CEs for this TaskQueue.

        If the TaskQueue names its own "GridCEs" they are returned as is.
        Otherwise the site mask from the job DB is reduced by the TaskQueue
        "BannedSites" and "Sites" entries and the CEs of the remaining sites
        are collected from the CS for every target grid, keeping those whose
        SubmissionMode equals this director's gridMiddleware.

        :return: list of CE names; an empty list when nothing can be used.
        """
        # assume user knows what they're doing and avoid site mask e.g. sam jobs
        if "GridCEs" in taskQueueDict and taskQueueDict["GridCEs"]:
            self.log.info(
                "CEs requested by TaskQueue %s:" % taskQueueDict["TaskQueueID"], ", ".join(taskQueueDict["GridCEs"])
            )
            return taskQueueDict["GridCEs"]

        # Get the mask
        ret = jobDB.getSiteMask()
        if not ret["OK"]:
            self.log.error("Can not retrieve site Mask from DB:", ret["Message"])
            return []

        siteMask = ret["Value"]
        if not siteMask:
            self.log.error("Site mask is empty")
            return []

        self.log.verbose("Site Mask: %s" % ", ".join(siteMask))

        # remove banned sites from siteMask
        if "BannedSites" in taskQueueDict:
            for site in taskQueueDict["BannedSites"]:
                if site in siteMask:
                    siteMask.remove(site)
                    self.log.verbose("Removing banned site %s from site Mask" % site)

        # remove from the mask if a Site is given
        if "Sites" in taskQueueDict:
            siteMask = [site for site in siteMask if site in taskQueueDict["Sites"]]

        if not siteMask:
            # pilot can not be submitted
            self.log.info("No Valid Site Candidate in Mask for TaskQueue %s" % taskQueueDict["TaskQueueID"])
            return []

        self.log.info("Site Candidates for TaskQueue %s:" % taskQueueDict["TaskQueueID"], ", ".join(siteMask))

        # Get CE's associates to the given site Names
        ceMask = []

        for grid in self.targetGrids:

            section = "/Resources/Sites/%s" % grid
            ret = gConfig.getSections(section)
            if not ret["OK"]:
                # this is hack, maintained until LCG is added as TargetGrid for the gLite SubmitPool
                section = "/Resources/Sites/LCG"
                ret = gConfig.getSections(section)

            if not ret["OK"]:
                self.log.error("Could not obtain CEs from CS", ret["Message"])
                continue

            gridSites = ret["Value"]
            for siteName in gridSites:
                if siteName in siteMask:
                    siteCEs = gConfig.getValue("%s/%s/CE" % (section, siteName), [])
                    for ce in siteCEs:
                        # CEs without an explicit SubmissionMode are treated as "gLite"
                        submissionMode = gConfig.getValue(
                            "%s/%s/CEs/%s/SubmissionMode" % (section, siteName, ce), "gLite"
                        )
                        if submissionMode == self.gridMiddleware and ce not in ceMask:
                            ceMask.append(ce)

        if not ceMask:
            self.log.info("No CE Candidate found for TaskQueue %s:" % taskQueueDict["TaskQueueID"], ", ".join(siteMask))

        self.log.verbose("CE Candidates for TaskQueue %s:" % taskQueueDict["TaskQueueID"], ", ".join(ceMask))

        return ceMask

    @staticmethod
    def _pilotsForJobs(jobsToCover, maxJobsInFillMode):
        """
        Return the number of pilots needed when each pilot covers up to
        `maxJobsInFillMode` jobs: the ceiling of the ratio, never below 1.

        Floor division keeps the result an integer on both Python 2 and 3;
        true division would yield a fractional pilot count on Python 3.
        """
        return max(1, (jobsToCover - 1) // maxJobsInFillMode + 1)

    def _getPilotOptions(self, taskQueueDict, pilotsToSubmit):
        """
        Build the command line options of the pilots for a TaskQueue.

        Decides between private and generic pilots, resolves the credentials
        to submit with (requesting a proxy token for generic pilots) and
        collects the pilot options.

        :param taskQueueDict: TaskQueue description; the keys read here are
                              "TaskQueueID", "OwnerDN", "OwnerGroup", "Setup",
                              "CPUTime" and, optionally, "PilotTypes" and "ForceGeneric".
        :param pilotsToSubmit: requested number of pilots.
        :return: S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup,
                 submitPrivatePilot, privateTQ ) ), or S_ERROR when the generic
                 pilot credentials or the proxy token can not be obtained.
        """

        # Need to limit the maximum number of pilots to submit at once
        # For generic pilots this is limited by the number of use of the tokens and the
        # maximum number of jobs in Filling mode, but for private Jobs we need an extra limitation:
        pilotsToSubmit = max(min(pilotsToSubmit, int(50 / self.maxJobsInFillMode)), 1)
        pilotOptions = []
        # A random fraction of the pilots for generic TaskQueues is submitted as private
        privateIfGenericTQ = self.privatePilotFraction > random.random()
        privateTQ = "PilotTypes" in taskQueueDict and "private" in [t.lower() for t in taskQueueDict["PilotTypes"]]
        forceGeneric = "ForceGeneric" in taskQueueDict
        submitPrivatePilot = (privateIfGenericTQ or privateTQ) and not forceGeneric
        if submitPrivatePilot:
            self.log.verbose("Submitting private pilots for TaskQueue %s" % taskQueueDict["TaskQueueID"])
            ownerDN = taskQueueDict["OwnerDN"]
            ownerGroup = taskQueueDict["OwnerGroup"]
            # User Group requirement
            pilotOptions.append("-G %s" % taskQueueDict["OwnerGroup"])
            # check if group allows jobsharing
            ownerGroupProperties = getPropertiesForGroup(ownerGroup)
            if "JobSharing" not in ownerGroupProperties:
                # Add Owner requirement to pilot
                pilotOptions.append("-O '%s'" % ownerDN)
            if privateTQ:
                pilotOptions.append("-o /Resources/Computing/CEDefaults/PilotType=private")
            maxJobsInFillMode = self.maxJobsInFillMode
        else:
            # For generic jobs we'll submit mixture of generic and private pilots
            self.log.verbose("Submitting generic pilots for TaskQueue %s" % taskQueueDict["TaskQueueID"])
            # ADRI: Find the generic group
            result = findGenericPilotCredentials(group=taskQueueDict["OwnerGroup"])
            if not result["OK"]:
                self.log.error(ERROR_GENERIC_CREDENTIALS, result["Message"])
                return S_ERROR(ERROR_GENERIC_CREDENTIALS)
            ownerDN, ownerGroup = result["Value"]

            result = gProxyManager.requestToken(ownerDN, ownerGroup, max(pilotsToSubmit, self.maxJobsInFillMode))
            if not result["OK"]:
                self.log.error(ERROR_TOKEN, result["Message"])
                return S_ERROR(ERROR_TOKEN)
            (token, numberOfUses) = result["Value"]
            # Never plan for more pilots than the token can be used for
            pilotsToSubmit = min(numberOfUses, pilotsToSubmit)

            pilotOptions.append("-o /Security/ProxyToken=%s" % token)

            # Integer (ceiling) number of pilots, each one running several jobs in filling mode
            pilotsToSubmit = self._pilotsForJobs(pilotsToSubmit, self.maxJobsInFillMode)

            # Share the token uses among the pilots
            maxJobsInFillMode = int(numberOfUses / pilotsToSubmit)
        # Use Filling mode
        pilotOptions.append("-M %s" % maxJobsInFillMode)

        # Debug
        pilotOptions.append("-d")
        # Setup.
        pilotOptions.append("-S %s" % taskQueueDict["Setup"])
        # CS Servers
        csServers = gConfig.getServersList()
        if len(csServers) > 3:
            # Remove the master
            master = gConfigurationData.getMasterServer()
            if master in csServers:
                csServers.remove(master)
        pilotOptions.append("-C %s" % ",".join(csServers))
        # DIRAC Extensions
        extensionsList = getCSExtensions()
        if extensionsList:
            pilotOptions.append("-e %s" % ",".join(extensionsList))
        # Get DIRAC version and project, There might be global Setup defaults and per VO/Setup defaults (from configure)
        opsHelper = Operations(group=taskQueueDict["OwnerGroup"], setup=taskQueueDict["Setup"])
        # Requested version of DIRAC (it can be a list, so we take the first one)
        version = opsHelper.getValue(cfgPath("Pilot", "Version"), [self.installVersion])[0]
        pilotOptions.append("-r %s" % version)
        # Requested Project to install
        installProject = opsHelper.getValue(cfgPath("Pilot", "Project"), self.installProject)
        if installProject:
            pilotOptions.append("-l %s" % installProject)
        installation = opsHelper.getValue(cfgPath("Pilot", "Installation"), self.installation)
        if installation:
            pilotOptions.append("-V %s" % installation)
        # Requested CPU time
        pilotOptions.append("-T %s" % taskQueueDict["CPUTime"])

        # Add the SubmitPool option unless the extra options already carry it
        if self.submitPoolOption not in self.extraPilotOptions:
            pilotOptions.append(self.submitPoolOption)

        if self.extraPilotOptions:
            pilotOptions.extend(self.extraPilotOptions)

        return S_OK((pilotOptions, pilotsToSubmit, ownerDN, ownerGroup, submitPrivatePilot, privateTQ))

    def _submitPilots(
        self,
        workDir,
        taskQueueDict,
        pilotOptions,
        pilotsToSubmit,
        ceMask,
        submitPrivatePilot,
        privateTQ,
        proxy,
        pilotsPerJob,
    ):
        """
        This method must be implemented on the Backend specific derived class.

        The base implementation only logs an error. A missing implementation
        is a problem with the Director, not with the Job, so S_OK is
        returned rather than S_ERROR.
        """
        self.log.error("_submitPilots method not implemented")
        return S_OK()

    def submitPilots(self, taskQueueDict, pilotsToSubmit, workDir=None):
        """
        Submit pilots for the given TaskQueue.

        Resolves the candidate CEs, builds the pilot options, obtains a
        proxy for the resolved credentials and hands everything over to the
        backend specific _submitPilots method.

        :return: the result of _submitPilots; S_ERROR when no CE, pilot
                 options or proxy are available; S_OK( 0 ) when an unexpected
                 exception was caught (it is logged, not propagated).
        """
        try:

            taskQueueID = taskQueueDict["TaskQueueID"]

            self.log.verbose("Submitting Pilot")
            ceMask = self._resolveCECandidates(taskQueueDict)
            if not ceMask:
                return S_ERROR("No CE available for TaskQueue %d" % int(taskQueueID))
            result = self._getPilotOptions(taskQueueDict, pilotsToSubmit)
            if not result["OK"]:
                return result
            # The (possibly reduced) pilot count computed by _getPilotOptions is
            # passed on as pilotsPerJob, next to the originally requested pilotsToSubmit
            (pilotOptions, pilotsPerJob, ownerDN, ownerGroup, submitPrivatePilot, privateTQ) = result["Value"]
            # get a valid proxy, submit with a long proxy to avoid renewal
            ret = self._getPilotProxyFromDIRACGroup(ownerDN, ownerGroup, requiredTimeLeft=86400 * 5)
            if not ret["OK"]:
                self.log.error(ret["Message"])
                self.log.error("No proxy Available", 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                return S_ERROR(ERROR_PROXY)
            proxy = ret["Value"]
            # Now call a Grid Specific method to handle the final submission of the pilots
            return self._submitPilots(
                workDir,
                taskQueueDict,
                pilotOptions,
                pilotsToSubmit,
                ceMask,
                submitPrivatePilot,
                privateTQ,
                proxy,
                pilotsPerJob,
            )

        except Exception:
            # Best effort: a failure here must not propagate to the caller
            self.log.exception("Error in Pilot Submission")

        return S_OK(0)

    def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup, requiredTimeLeft):
        """
        Download the pilot proxy for the given DN and group.

        To be overwritten if a given Pilot does not require a full proxy.

        :return: the result of gProxyManager.getPilotProxyFromDIRACGroup
        """
        self.log.info("Downloading %s@%s proxy" % (ownerDN, ownerGroup))
        return gProxyManager.getPilotProxyFromDIRACGroup(ownerDN, ownerGroup, requiredTimeLeft)

    def exceptionCallBack(self, threadedJob, exceptionInfo):
        """
        Log a failed pilot submission; the arguments are not inspected.
        """
        self.log.exception("Error in Pilot Submission")