def __getJobSiteRequirement( self, job, classAdJob ):
    """Work out the site constraints attached to a job.

       The 'Site' attribute stored in the JobDB is split into a list (a failed
       lookup counts as an empty list) and turned into the candidate sites:
         - it contains 'Multiple': the sites are read from the JDL 'Site' expression
         - it contains 'ANY' or 'Unknown': no candidate site is returned
         - anything else: the stored list is returned as it is
       The banned sites are read from the JDL 'BannedSites' attribute.

       Returns the S_OK() structure extended with the keys 'Sites' and
       'BannedSites'.
    """
    siteQuery = self.jobDB.getJobAttribute( job, 'Site' )
    chosenSites = List.fromChar( siteQuery['Value'] ) if siteQuery['OK'] else []

    requirement = S_OK()

    # Strip the curly braces before splitting the value into a list
    bannedExpr = classAdJob.getAttributeString( 'BannedSites' )
    banned = List.fromChar( bannedExpr.replace( '{', '' ).replace( '}', '' ) )

    if 'Multiple' in chosenSites:
        requirement['Sites'] = classAdJob.getListFromExpression( 'Site' )
    elif 'ANY' in chosenSites or 'Unknown' in chosenSites:
        requirement['Sites'] = []
    else:
        if len( chosenSites ) == 1:
            self.log.info( 'Job %s has single chosen site %s specified in JDL' % ( job, chosenSites[0] ) )
        requirement['Sites'] = chosenSites

    if not banned:
        requirement['BannedSites'] = []
    else:
        self.log.info( 'Job %s has JDL requirement to ban %s' % ( job, banned ) )
        requirement['BannedSites'] = banned

    return requirement
Example #2
0
  def checkJob( self, job, classAdJob ):
    """Build the chain of optimizers for the job and pass it to processJob.

       The JDL is first loaded into a JobDescription; if that fails the job is
       set to failed and the error is returned. A 'JobPath' defined in the JDL
       is used as the chain as it is. Otherwise the chain is basePath, followed
       by either the path returned by the VO plugin or (without plugin) the
       inputData path when the job has input data, followed by endPath.

       Returns whatever processJob returns, or the failed result of the step
       that could not be completed.
    """
    description = JobDescription()
    loaded = description.loadDescription( classAdJob.asJDL() )
    if not loaded[ 'OK' ]:
      self.setFailedJob( job, loaded['Message'], classAdJob )
      return loaded
    self.__syncJobDesc( job, description, classAdJob )

    # Check if job defines a path itself
    # FIXME: only some group might be able to overwrite the jobPath
    ownChain = classAdJob.get_expression( 'JobPath' )
    ownChain = ownChain.replace( '"', '' ).replace( 'Unknown', '' )
    if ownChain:
      # HACK: Remove the { and } to ensure we have a simple string
      ownChain = ownChain.replace( "{", "" ).replace( "}", "" )
      self.log.info( 'Job %s defines its own optimizer chain %s' % ( job, ownChain ) )
      return self.processJob( job, List.fromChar( ownChain ) )

    # If no path, construct based on JDL and VO path module if present
    chain = list( self.basePath )
    if not self.voPlugin:
      self.log.verbose( 'No VO specific plugin module specified' )
      # Should only rely on an input data setting in absence of VO plugin
      inputData = self.jobDB.getInputData( job )
      if not inputData['OK']:
        self.log.error( 'Failed to get input data from JobDB', job )
        self.log.warn( inputData['Message'] )
        return inputData

      # A non empty value means the job has input data
      if inputData['Value']:
        self.log.info( 'Job %s has an input data requirement' % ( job ) )
        chain.extend( self.inputData )
      else:
        self.log.info( 'Job %s has no input data requirement' % ( job ) )
    else:
      pluginArgs = { 'JobID': job,
                     'ClassAd': classAdJob,
                     'ConfigPath': self.am_getModuleParam( "section" ) }
      factory = ModuleFactory()
      created = factory.getModule( self.voPlugin, pluginArgs )
      if not created['OK']:
        self.log.error( 'Could not instantiate module:', '%s' % ( self.voPlugin ) )
        self.setFailedJob( job, 'Could not instantiate module: %s' % ( self.voPlugin ), classAdJob )
        return S_ERROR( 'Holding pending jobs' )

      executed = created['Value'].execute()
      if not executed['OK']:
        self.log.warn( 'Execution of %s failed' % ( self.voPlugin ) )
        return executed
      extraPath = List.fromChar( executed['Value'] )
      if extraPath:
        chain.extend( extraPath )
        self.log.verbose( 'Adding extra VO specific optimizers to path: %s' % ( extraPath ) )

    chain.extend( self.endPath )
    self.log.info( 'Constructed path for job %s is: %s' % ( job, chain ) )
    return self.processJob( job, chain )
  def __submitPilots( self, taskQueueDict, pilotsToSubmit ):
    """
      Queue the pilot submission in the Thread Pool of the first usable
      SubmitPool. A SubmitPool whose Thread Pool does not accept the job is
      disabled until the next iteration.

      The candidate SubmitPools come from the TaskQueue definition or, if it
      has none, from the DefaultSubmitPools option, and are tried in random
      order. S_OK( pilotsToSubmit ) is returned in every case.
    """
    # Check if a specific MiddleWare is required
    if 'SubmitPools' not in taskQueueDict:
      candidatePools = self.am_getOption( 'DefaultSubmitPools' )
    else:
      candidatePools = taskQueueDict[ 'SubmitPools' ]

    for poolName in List.randomize( candidatePools ):
      self.log.verbose( 'Trying SubmitPool:', poolName )

      if poolName in self.directors and self.directors[poolName]['isEnabled']:
        directorInfo = self.directors[poolName]
        threadPool = self.pools[directorInfo['pool']]
        director = directorInfo['director']
        queued = threadPool.generateJobAndQueueIt( director.submitPilots,
                                                   args = ( taskQueueDict, pilotsToSubmit, self.workDir ),
                                                   oCallback = self.callBack,
                                                   oExceptionCallback = director.exceptionCallBack,
                                                   blocking = False )
        if queued['OK']:
          time.sleep( self.am_getOption( 'ThreadStartDelay' ) )
          break
        # Disable submission until next iteration
        directorInfo['isEnabled'] = False
      else:
        self.log.verbose( 'Not Enabled' )

    return S_OK( pilotsToSubmit )
Example #4
0
  def _prepareJDL( self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ ):
    """
      Write JDL for Pilot Submission
    """
    # RB = List.randomize( self.resourceBrokers )[0]
    LDs = []
    NSs = []
    LBs = []
    # Select Randomly one RB from the list
    RB = List.randomize( self.resourceBrokers )[0]
    LDs.append( '"%s:9002"' % RB )
    LBs.append( '"%s:9000"' % RB )

    for LB in self.loggingServers:
      NSs.append( '"%s:7772"' % LB )

    LD = ', '.join( LDs )
    NS = ', '.join( NSs )
    LB = ', '.join( LBs )

    vo = getVO()
    if privateTQ or vo not in ['lhcb']:
      extraReq = "True"
    else:
      if submitPrivatePilot:
        extraReq = "! AllowsGenericPilot"
      else:
        extraReq = "AllowsGenericPilot"

    rbJDL = """
AllowsGenericPilot = Member( "VO-lhcb-pilot" , other.GlueHostApplicationSoftwareRunTimeEnvironment );
Requirements = pilotRequirements && other.GlueCEStateStatus == "Production" && %s;
RetryCount = 0;
ErrorStorage = "%s/pilotError";
OutputStorage = "%s/pilotOutput";
# ListenerPort = 44000;
ListenerStorage = "%s/Storage";
VirtualOrganisation = "lhcb";
LoggingTimeout = 30;
LoggingSyncTimeout = 30;
LoggingDestination = { %s };
# Default NS logger level is set to 0 (null)
# max value is 6 (very ugly)
NSLoggerLevel = 0;
DefaultLogInfoLevel = 0;
DefaultStatusLevel = 0;
NSAddresses = { %s };
LBAddresses = { %s };
MyProxyServer = "no-myproxy.cern.ch";
""" % ( extraReq, workingDirectory, workingDirectory, workingDirectory, LD, NS, LB )

    pilotJDL, pilotRequirements = self._JobJDL( taskQueueDict, pilotOptions, ceMask )

    jdl = os.path.join( workingDirectory, '%s.jdl' % taskQueueDict['TaskQueueID'] )
    jdl = self._writeJDL( jdl, [pilotJDL, rbJDL] )

    return {'JDL':jdl, 'Requirements':pilotRequirements + " && " + extraReq, 'Pilots': pilotsToSubmit, 'RB':RB }
Example #5
0
  def __getJobSiteRequirement( self, job, classAdJob ):
    """Returns the candidate sites of the job and the sites it has banned, as
       both can affect the scheduling decision.

       The 'Site' attribute stored in the JobDB is split into a list (a failed
       lookup counts as an empty list):
         - it contains 'Multiple' or an entry with 'Group' in it: the sites
           are read from the JDL 'Site' expression, and dropped altogether if
           that list contains 'ANY' or an empty entry
         - it contains 'ANY' or 'Unknown': no candidate site is returned
         - anything else: the stored list is returned as it is
       The banned sites are read from the JDL 'BannedSite' attribute, falling
       back to the legacy 'BannedSites' one.

       The result is the S_OK() structure extended with the keys 'Sites' and
       'BannedSites'.
    """
    siteQuery = self.jobDB.getJobAttribute( job, 'Site' )
    storedSites = List.fromChar( siteQuery['Value'] ) if siteQuery['OK'] else []

    requirement = S_OK()

    # The legacy option variant is only tried when the first one is empty
    bannedExpr = classAdJob.getAttributeString( 'BannedSite' ) or classAdJob.getAttributeString( 'BannedSites' )
    banned = List.fromChar( bannedExpr.replace( '{', '' ).replace( '}', '' ) )

    hasGroup = any( "Group" in entry for entry in storedSites )

    if 'Multiple' in storedSites or hasGroup:
      candidates = classAdJob.getListFromExpression( 'Site' )
      # We might also be here after a Staging Request where several Sites are allowed
      if 'ANY' in candidates or '' in candidates:
        candidates = []
    elif 'ANY' in storedSites or 'Unknown' in storedSites:
      candidates = []
    else:
      if len( storedSites ) == 1:
        self.log.info( 'Job %s has single chosen site %s specified in JDL' % ( job, storedSites[0] ) )
      candidates = storedSites
    requirement['Sites'] = candidates

    if not banned:
      requirement['BannedSites'] = []
    else:
      self.log.info( 'Job %s has JDL requirement to ban %s' % ( job, banned ) )
      requirement['BannedSites'] = banned

    return requirement
Example #6
0
    def __getJobSiteRequirement(self, job, classAdJob):
        """Collect the candidate and banned sites of a job.

        The "Site" attribute stored in the JobDB decides the candidates:
        a plain list of sites is returned unchanged; "Multiple" or an entry
        containing "Group" makes the JDL "Site" expression the source (an
        "ANY" or empty entry there clears the list); "ANY" or "Unknown"
        yields no candidate. A failed JobDB lookup counts as an empty list.
        Banned sites are taken from the JDL "BannedSites" attribute.

        Returns the S_OK() structure with the extra keys "Sites" and
        "BannedSites".
        """
        lookup = self.jobDB.getJobAttribute(job, "Site")
        if lookup["OK"]:
            site = List.fromChar(lookup["Value"])
        else:
            site = []

        result = S_OK()

        bannedText = classAdJob.getAttributeString("BannedSites")
        for brace in ("{", "}"):
            bannedText = bannedText.replace(brace, "")
        bannedSites = List.fromChar(bannedText)

        groupFlag = bool([entry for entry in site if "Group" in entry])

        unrestricted = "ANY" in site or "Unknown" in site
        expandable = "Multiple" in site or groupFlag

        if not (unrestricted or expandable):
            if len(site) == 1:
                self.log.info("Job %s has single chosen site %s specified in JDL" % (job, site[0]))
            result["Sites"] = site
        elif expandable:
            expanded = classAdJob.getListFromExpression("Site")
            # We might also be here after a Staging Request where several Sites are allowed
            if "ANY" in expanded or "" in expanded:
                expanded = []
            result["Sites"] = expanded
        else:
            result["Sites"] = []

        result["BannedSites"] = []
        if bannedSites:
            self.log.info("Job %s has JDL requirement to ban %s" % (job, bannedSites))
            result["BannedSites"] = bannedSites

        return result
Example #7
0
  def __executeVOPlugin( self, voPlugin, jobState ):
    if voPlugin not in self.__voPlugins:
      modName = List.fromChar( voPlugin, "." )[-1]
      try:
        module = __import__( voPlugin, globals(), locals(), [ modName ] )
      except ImportError, excp:
        self.jobLog.exception( "Could not import VO plugin %s" % voPlugin )
        return S_ERROR( "Could not import VO plugin %s: %s" % ( voPlugin, excp ) )

      try:
        self.__voPlugins[ voPlugin ] = getattr( module, modName )
      except AttributeError, excp:
        return S_ERROR( "Could not get plugin %s from module %s: %s" % ( modName, voPlugin, str( excp ) ) )
Example #8
0
  def parseJobSubmitStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Execute the Job Submit command and parse its stdout to return the pilot reference

      :param proxy: proxy handed over to executeGridCommand
      :param cmd: the submit command to execute
      :param taskQueueID: TaskQueue the submission belongs to; only used in the
                          log messages ( formatted with %d, so it must be a number )
      :param rb: broker the command is addressed to; only used in the error mail
      :return: False if the command cannot be executed, exits with a non zero
               code or prints no reference. Otherwise the tuple ( reference, host ):
               reference is the last https URL found in stdout, host is the text
               between 'https://' and the last ':' of the first reference that
               has such a form ( '' if no reference has it )
    """
    start = time.time()
    self.log.verbose( 'Executing Job Submit for TaskQueue', taskQueueID )

    ret = executeGridCommand( proxy, cmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute Job Submit:', ret['Message'] )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing Job Submit:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    self.log.info( 'Job Submit Execution Time: %.2f for TaskQueue %d' % ( ( time.time() - start ), taskQueueID ) )

    stdout = ret['Value'][1]

    reference = None
    host = ''
    for line in List.fromChar( stdout, '\n' ):
      refMatch = re.search( r"(https:\S+)", line )
      if not refMatch:
        continue
      reference = refMatch.group( 1 )
      if not host:
        # Not every https URL has a ':' after the scheme; re.search returns
        # None for those, so the match must be checked before calling group
        hostMatch = re.search( r"https://(.+):.+", reference )
        if hostMatch:
          host = hostMatch.group( 1 )

    if reference is None:
      self.log.error( 'Job Submit returns no Reference:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      return False

    self.log.info( 'Reference %s for TaskQueue %s' % ( reference, taskQueueID ) )

    return reference, host
Example #9
0
    def parseJobSubmitStdout(self, proxy, cmd, taskQueueID, rb):
        """
        Execute the Job Submit command and parse its stdout to return the pilot reference

        :param proxy: proxy handed over to executeGridCommand
        :param cmd: the submit command to execute
        :param taskQueueID: TaskQueue the submission belongs to; only used in
                            the log messages (formatted with %d, so it must be
                            a number)
        :param rb: broker the command is addressed to; only used in the error mail
        :return: False if the command cannot be executed, exits with a non zero
                 code or prints no reference. Otherwise the tuple
                 (reference, host): reference is the last https URL found in
                 stdout, host is the text between "https://" and the last ":"
                 of the first reference that has such a form ("" if no
                 reference has it)
        """
        start = time.time()
        self.log.verbose("Executing Job Submit for TaskQueue", taskQueueID)

        ret = executeGridCommand(proxy, cmd, self.gridEnv)

        if not ret["OK"]:
            self.log.error("Failed to execute Job Submit:", ret["Message"])
            self.__sendErrorMail(rb, "Job Submit", cmd, ret, proxy)
            return False
        if ret["Value"][0] != 0:
            self.log.error("Error executing Job Submit:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
            self.__sendErrorMail(rb, "Job Submit", cmd, ret, proxy)
            return False
        self.log.info("Job Submit Execution Time: %.2f for TaskQueue %d" % ((time.time() - start), taskQueueID))

        stdout = ret["Value"][1]

        reference = None
        host = ""
        for line in List.fromChar(stdout, "\n"):
            refMatch = re.search(r"(https:\S+)", line)
            if not refMatch:
                continue
            reference = refMatch.group(1)
            if not host:
                # Not every https URL has a ":" after the scheme; re.search
                # returns None for those, so check before calling group
                hostMatch = re.search(r"https://(.+):.+", reference)
                if hostMatch:
                    host = hostMatch.group(1)

        if reference is None:
            self.log.error("Job Submit returns no Reference:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
            return False

        self.log.info("Reference %s for TaskQueue %s" % (reference, taskQueueID))

        return reference, host
Example #10
0
  def configure( self, csSection, submitPool ):
    """
     Here goes common configuration for all Grid PilotDirectors

     After the base class configuration and the reload of the options, the
     expired entries of both WMS caches are purged, every Resource Broker that
     is still a key of the failing WMS cache is taken out of
     self.resourceBrokers, and the remaining brokers are shuffled.

     :param csSection: passed on to PilotDirector.configure and reloadConfiguration
     :param submitPool: passed on to PilotDirector.configure and reloadConfiguration
    """
    PilotDirector.configure( self, csSection, submitPool )
    self.reloadConfiguration( csSection, submitPool )

    self.__failingWMSCache.purgeExpired()
    self.__ticketsWMSCache.purgeExpired()
    for rb in self.__failingWMSCache.getKeys():
      try:
        self.resourceBrokers.remove( rb )
      except ValueError:
        # The broker is not (or no longer) in the list: nothing to remove.
        # Only this error is ignored, anything else is a real problem.
        pass

    self.resourceBrokers = List.randomize( self.resourceBrokers )

    if self.gridEnv:
      self.log.info( ' GridEnv:        ', self.gridEnv )
    if self.resourceBrokers:
      self.log.info( ' ResourceBrokers:', ', '.join( self.resourceBrokers ) )
Example #11
0
  def parseListMatchStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Execute the List Match command and return the list of matched CE's.

      Returns False (after sending an error mail) when the command cannot be
      executed or exits with a non zero code. Otherwise returns the stdout
      lines that contain '/jobmanager-' or '/cream-', which may be none.
    """
    self.log.verbose( 'Executing List Match for TaskQueue', taskQueueID )

    began = time.time()
    ret = executeGridCommand( proxy, cmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute List Match:', ret['Message'] )
      self.__sendErrorMail( rb, 'List Match', cmd, ret, proxy )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing List Match:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'List Match', cmd, ret, proxy )
      return False
    self.log.info( 'List Match Execution Time: %.2f for TaskQueue %d' % ( ( time.time() - began ), taskQueueID ) )

    stdout, stderr = ret['Value'][1], ret['Value'][2]

    # Parse std.out
    # TODO: the line has to be stripped from extra info
    matchedCEs = [ outLine for outLine in List.fromChar( stdout, '\n' )
                   if re.search( '/jobmanager-', outLine ) or re.search( '/cream-', outLine ) ]

    if matchedCEs:
      self.log.debug( 'List-Match returns:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.log.info( 'List-Match found %s CEs for TaskQueue' % len( matchedCEs ), taskQueueID )
      self.log.verbose( ', '.join( matchedCEs ) )
    else:
      self.log.info( 'List-Match failed to find CEs for TaskQueue', taskQueueID )
      self.log.info( stdout )
      self.log.info( stderr )

    return matchedCEs
Example #12
0
    def _getChildrenReferences(self, proxy, parentReference, taskQueueID):
        """
        Run glite-wms-job-status on the parent job and collect the references
        of its children.

        Returns an empty list when the command cannot be executed or exits
        with a non zero code, [parentReference] when no line of the output
        matches "Status info for the Job : https:...", and otherwise the
        distinct references found, the parent itself excluded.
        """
        statusCmd = ["glite-wms-job-status", parentReference]

        began = time.time()
        self.log.verbose("Executing Job Status for TaskQueue", taskQueueID)

        ret = executeGridCommand(proxy, statusCmd, self.gridEnv)

        if not ret["OK"]:
            self.log.error("Failed to execute Job Status", ret["Message"])
            return []
        if ret["Value"][0] != 0:
            self.log.error("Error executing Job Status:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
            return []
        self.log.info("Job Status Execution Time: %.2f" % (time.time() - began))

        children = []
        anyStatusLine = False
        for outLine in List.fromChar(ret["Value"][1], "\n"):
            found = re.search("Status info for the Job : (https:\S+)", outLine)
            if not found:
                continue
            anyStatusLine = True
            reference = found.group(1)
            if reference != parentReference and reference not in children:
                children.append(reference)

        if anyStatusLine:
            return children

        # Not a single status line: report it and fall back to the parent
        error = str(ret["Value"][0]) + "\n".join(ret["Value"][1:3])
        self.log.error("Job Status returns no Child Reference:", error)
        return [parentReference]
Example #13
0
    def parseListMatchStdout(self, proxy, cmd, taskQueueID, rb):
        """
        Run the List Match command and extract the matched CE's from its stdout.

        A command that cannot be executed, or that exits with a non zero code,
        triggers an error mail and makes the method return False. In all other
        cases the (possibly empty) list of stdout lines containing
        "/jobmanager-" or "/cream-" is returned.
        """
        self.log.verbose("Executing List Match for TaskQueue", taskQueueID)

        startTime = time.time()
        ret = executeGridCommand(proxy, cmd, self.gridEnv)

        if not ret["OK"]:
            self.log.error("Failed to execute List Match:", ret["Message"])
            self.__sendErrorMail(rb, "List Match", cmd, ret, proxy)
            return False
        if ret["Value"][0] != 0:
            self.log.error("Error executing List Match:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
            self.__sendErrorMail(rb, "List Match", cmd, ret, proxy)
            return False
        elapsed = time.time() - startTime
        self.log.info("List Match Execution Time: %.2f for TaskQueue %d" % (elapsed, taskQueueID))

        stdout = ret["Value"][1]
        stderr = ret["Value"][2]

        # Parse std.out
        ceLines = []
        for outputLine in List.fromChar(stdout, "\n"):
            if not (re.search("/jobmanager-", outputLine) or re.search("/cream-", outputLine)):
                continue
            # TODO: the line has to be stripped from extra info
            ceLines.append(outputLine)

        if ceLines:
            self.log.debug("List-Match returns:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
            self.log.info("List-Match found %s CEs for TaskQueue" % len(ceLines), taskQueueID)
            self.log.verbose(", ".join(ceLines))
            return ceLines

        self.log.info("List-Match failed to find CEs for TaskQueue", taskQueueID)
        self.log.info(stdout)
        self.log.info(stderr)
        return ceLines
Example #14
0
  def _getChildrenReferences( self, proxy, parentReference, taskQueueID ):
    """
     Get the references of all the Children of a parent job, as reported by
     glite-wms-job-status.

     Returns False when the command cannot be executed or exits with a non
     zero code, [parentReference] when no line of the output matches
     'Status info for the Job : https:...', and otherwise the list of distinct
     references found, without the parent itself.
    """
    statusCmd = [ 'glite-wms-job-status', parentReference ]

    startTime = time.time()
    self.log.verbose( 'Executing Job Status for TaskQueue', taskQueueID )

    ret = executeGridCommand( proxy, statusCmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute Job Status', ret['Message'] )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing Job Status:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      return False
    self.log.info( 'Job Status Execution Time: %.2f' % ( time.time() - startTime ) )

    stdout = ret['Value'][1]

    childRefs = []
    statusLines = 0
    for outputLine in List.fromChar( stdout, '\n' ):
      hit = re.search( "Status info for the Job : (https:\S+)", outputLine )
      if hit:
        statusLines += 1
        childRef = hit.group( 1 )
        if childRef not in childRefs and childRef != parentReference:
          childRefs.append( childRef )

    if statusLines == 0:
      # No status line at all: log the whole output and return the parent
      error = str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] )
      self.log.error( 'Job Status returns no Child Reference:', error )
      return [parentReference]

    return childRefs
Example #15
0
    def __submitPilots(self, taskQueueDict, pilotsToSubmit):
        """
        Try to insert the submission in the Thread Pool of one of the
        SubmitPools, walking through them in random order until one accepts.

        The SubmitPools are read from the TaskQueue definition, or from the
        DefaultSubmitPools option when the TaskQueue does not name any. A pool
        that is unknown or disabled is skipped; a pool whose Thread Pool does
        not accept the job is disabled until the next iteration.

        Always returns S_OK(pilotsToSubmit).
        """
        # Check if a specific MiddleWare is required
        poolNames = (taskQueueDict['SubmitPools']
                     if 'SubmitPools' in taskQueueDict else
                     self.am_getOption('DefaultSubmitPools'))

        for name in List.randomize(poolNames):
            self.log.verbose('Trying SubmitPool:', name)

            usable = name in self.directors and self.directors[name]['isEnabled']
            if not usable:
                self.log.verbose('Not Enabled')
                continue

            settings = self.directors[name]
            threadPool = self.pools[settings['pool']]
            director = settings['director']
            outcome = threadPool.generateJobAndQueueIt(
                director.submitPilots,
                args=(taskQueueDict, pilotsToSubmit, self.workDir),
                oCallback=self.callBack,
                oExceptionCallback=director.exceptionCallBack,
                blocking=False)
            if outcome['OK']:
                time.sleep(self.am_getOption('ThreadStartDelay'))
                break

            # Disable submission until next iteration
            settings['isEnabled'] = False

        return S_OK(pilotsToSubmit)
Example #16
0
    argsDict = { 'JobID': jobState.jid,
                 'JobState' : jobState,
                 'ConfigPath':self.ex_getProperty( "section" ) }
    try:
      modInstance = self.__voPlugins[ voPlugin ]( argsDict )
      result = modInstance.execute()
    except Exception, excp:
      self.jobLog.exception( "Excp while executing %s" % voPlugin )
      return S_ERROR( "Could not execute VO plugin %s: %s" % ( voPlugin, excp ) )

    if not result['OK']:
      return result
    extraPath = result[ 'Value' ]
    if type( extraPath ) in types.StringTypes:
      extraPath = List.fromChar( result['Value'] )
    return S_OK( extraPath )


  def optimizeJob( self, jid, jobState ):
    result = jobState.getManifest()
    if not result[ 'OK' ]:
      return result
    jobManifest = result[ 'Value' ]
    opChain = jobManifest.getOption( "JobPath", [] )
    if opChain:
      self.jobLog.info( 'Job defines its own optimizer chain %s' % opChain )
      return self.__setOptimizerChain( jobState, opChain )
    #Construct path
    opPath = self.ex_getOption( 'BasePath', ['JobPath', 'JobSanity'] )
    voPlugin = self.ex_getOption( 'VOPlugin', '' )
Example #17
0
    def _prepareJDL(self, taskQueueDict, workingDirectory, pilotOptions,
                    pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ):
        """
      Write JDL for Pilot Submission
    """
        rbList = []
        # Select Randomly one RB from the list
        rb = List.randomize(self.resourceBrokers)[0]
        rbList.append('"https://%s:7443/glite_wms_wmproxy_server"' % rb)

        lbList = []
        for lb in self.loggingServers:
            lbList.append('"https://%s:9000"' % lb)
        lbList = List.randomize(lbList)

        nPilots = 1
        vo = gConfig.getValue('/DIRAC/VirtualOrganization', '')
        if privateTQ or vo not in ['lhcb']:
            extraReq = "True"
        else:
            if submitPrivatePilot:
                extraReq = "! AllowsGenericPilot"
            else:
                extraReq = "AllowsGenericPilot"

        wmsClientJDL = """

RetryCount = 0;
ShallowRetryCount = 0;
MyProxyServer = "%s";

AllowsGenericPilot = Member( "VO-lhcb-pilot" , other.GlueHostApplicationSoftwareRunTimeEnvironment );
Requirements = pilotRequirements && %s;
WmsClient = [
ErrorStorage = "%s/pilotError";
OutputStorage = "%s/pilotOutput";
# ListenerPort = 44000;
ListenerStorage = "%s/Storage";
RetryCount = 0;
ShallowRetryCount = 0;
WMProxyEndPoints = { %s };
LBEndPoints = { %s };
MyProxyServer = "%s";
EnableServiceDiscovery = false;
JdlDefaultAttributes =  [
    requirements  =  ( other.GlueCEStateStatus == "Production" || other.GlueCEStateStatus == "Special" );
    AllowZippedISB  =  true;
    SignificantAttributes  =  {"Requirements", "Rank", "FuzzyRank"};
    PerusalFileEnable  =  false;
    ];
];
""" % (self.myProxyServer, extraReq, workingDirectory, workingDirectory,
        workingDirectory, ', '.join(rbList), ', '.join(lbList),
        self.myProxyServer)

        if pilotsToSubmit > 1:
            wmsClientJDL += """
JobType = "Parametric";
Parameters= %s;
ParameterStep =1;
ParameterStart = 0;
""" % pilotsToSubmit
            nPilots = pilotsToSubmit

        (pilotJDL, pilotRequirements) = self._JobJDL(taskQueueDict,
                                                     pilotOptions, ceMask)

        jdl = os.path.join(workingDirectory,
                           '%s.jdl' % taskQueueDict['TaskQueueID'])
        jdl = self._writeJDL(jdl, [pilotJDL, wmsClientJDL])

        return {
            'JDL': jdl,
            'Requirements': pilotRequirements + " && " + extraReq,
            'Pilots': nPilots,
            'RB': rb
        }
Example #18
0
    def _prepareJDL(self, taskQueueDict, workingDirectory, pilotOptions,
                    pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ):
        """
      Write JDL for Pilot Submission
    """
        # RB = List.randomize( self.resourceBrokers )[0]
        LDs = []
        NSs = []
        LBs = []
        # Select Randomly one RB from the list
        RB = List.randomize(self.resourceBrokers)[0]
        LDs.append('"%s:9002"' % RB)
        LBs.append('"%s:9000"' % RB)

        for LB in self.loggingServers:
            NSs.append('"%s:7772"' % LB)

        LD = ', '.join(LDs)
        NS = ', '.join(NSs)
        LB = ', '.join(LBs)

        vo = getVO()
        if privateTQ or vo not in ['lhcb']:
            extraReq = "True"
        else:
            if submitPrivatePilot:
                extraReq = "! AllowsGenericPilot"
            else:
                extraReq = "AllowsGenericPilot"

        rbJDL = """
AllowsGenericPilot = Member( "VO-lhcb-pilot" , other.GlueHostApplicationSoftwareRunTimeEnvironment );
Requirements = pilotRequirements && other.GlueCEStateStatus == "Production" && %s;
RetryCount = 0;
ErrorStorage = "%s/pilotError";
OutputStorage = "%s/pilotOutput";
# ListenerPort = 44000;
ListenerStorage = "%s/Storage";
VirtualOrganisation = "lhcb";
LoggingTimeout = 30;
LoggingSyncTimeout = 30;
LoggingDestination = { %s };
# Default NS logger level is set to 0 (null)
# max value is 6 (very ugly)
NSLoggerLevel = 0;
DefaultLogInfoLevel = 0;
DefaultStatusLevel = 0;
NSAddresses = { %s };
LBAddresses = { %s };
MyProxyServer = "no-myproxy.cern.ch";
""" % (extraReq, workingDirectory, workingDirectory, workingDirectory, LD, NS,
        LB)

        pilotJDL, pilotRequirements = self._JobJDL(taskQueueDict, pilotOptions,
                                                   ceMask)

        jdl = os.path.join(workingDirectory,
                           '%s.jdl' % taskQueueDict['TaskQueueID'])
        jdl = self._writeJDL(jdl, [pilotJDL, rbJDL])

        return {
            'JDL': jdl,
            'Requirements': pilotRequirements + " && " + extraReq,
            'Pilots': pilotsToSubmit,
            'RB': RB
        }
Example #19
0
  def __parseJobStatus( self, job, gridType ):
    """ Parse output of grid pilot status command

        :param job: text of the status report of one job; for a parent job the
                    reports of its children are part of the same text
        :param gridType: 'LCG' makes the 'reached on:' date be parsed into the
                         status date, 'gLite' makes the 'Submitted:' date be
                         parsed; any other value parses no date
        :return: dictionary with the keys Status, DestinationSite, StatusDate,
                 isChild, isParent, ParentRef ( always False ), FinalStatus
                 ( Status is in self.finalStateList ), ChildRefs and
                 ChildDicts ( child reference -> dictionary of the same kind )

        Parsing is best effort: a problem is logged and the values extracted
        up to that point ( or the defaults ) are returned.
    """

    statusRE = r'Current Status:\s*(\w*)'
    destinationRE = r'Destination:\s*([\w\.-]*)'
    statusDateLCGRE = r'reached on:\s*....(.*)'
    submittedDateRE = r'Submitted:\s*....(.*)'
    statusFailedRE = r'Current Status:.*\(Failed\)'

    status = None
    destination = 'Unknown'
    statusDate = None
    submittedDate = None

    try:
      # Without a status line re.search returns None and .group raises,
      # which is reported by the handler below
      status = re.search( statusRE, job ).group( 1 )
      if status == 'Done' and re.search( statusFailedRE, job ):
        status = 'Failed'
      match = re.search( destinationRE, job )
      if match:
        destination = match.group( 1 )
      if gridType == 'LCG':
        match = re.search( statusDateLCGRE, job )
        if match:
          statusDate = match.group( 1 )
          statusDate = time.strftime( '%Y-%m-%d %H:%M:%S', time.strptime( statusDate, '%b %d %H:%M:%S %Y' ) )
      if gridType == 'gLite':
        match = re.search( submittedDateRE, job )
        if match:
          submittedDate = match.group( 1 )
          submittedDate = time.strftime( '%Y-%m-%d %H:%M:%S', time.strptime( submittedDate, '%b %d %H:%M:%S %Y %Z' ) )
    except Exception:
      # Only real parsing errors are absorbed here; SystemExit and
      # KeyboardInterrupt must not be swallowed
      self.log.exception( 'Error parsing %s Job Status output:\n' % gridType, job )

    isParent = 'Nodes information' in job
    isChild = 'Parent Job' in job

    if status == "Running":
      # Pilots can be in Running state for too long, due to bugs in the WMS
      if statusDate:
        statusTime = Time.fromString( statusDate )
        delta = Time.dateTime() - statusTime
        if delta > 4 * Time.day:
          self.log.info( 'Setting pilot status to Deleted after 4 days in Running' )
          status = "Deleted"
          statusDate = statusTime + 4 * Time.day
      elif submittedDate:
        statusTime = Time.fromString( submittedDate )
        delta = Time.dateTime() - statusTime
        if delta > 7 * Time.day:
          self.log.info( 'Setting pilot status to Deleted more than 7 days after submission still in Running' )
          status = "Deleted"
          statusDate = statusTime + 7 * Time.day

    childRefs = []
    childDicts = {}
    if isParent:
      # Each child report starts with its own reference on the first line
      for subjob in List.fromChar( job, ' Status info for the Job :' )[1:]:
        chRef = List.fromChar( subjob, '\n' )[0].strip()
        childDict = self.__parseJobStatus( subjob, gridType )
        childRefs.append( chRef )
        childDicts[chRef] = childDict

    return { 'Status': status,
             'DestinationSite': destination,
             'StatusDate': statusDate,
             'isChild': isChild,
             'isParent': isParent,
             'ParentRef': False,
             'FinalStatus' : status in self.finalStateList,
             'ChildRefs' : childRefs,
             'ChildDicts' : childDicts }
Example #20
0
  def getPilotStatus( self, proxy, gridType, pilotRefList ):
    """ Get GRID job status information using the job's owner proxy and
        GRID job IDs.

        :param proxy: proxy handed to executeGridCommand to run the status command
        :param gridType: 'LCG' ( edg-job-status ) or 'gLite' ( glite-wms-job-status )
        :param pilotRefList: list of pilot references appended to the command line
        :return: S_OK( resultDict ) where resultDict maps every pilot reference
                 found in the command output to a status dictionary with the keys
                 'Status', 'DestinationSite', 'StatusDate', 'isChild', 'isParent',
                 'ParentRef', 'FinalStatus', 'ChildRefs' and 'ChildDicts'.
                 S_ERROR( message ) if the gridType is not supported, the command
                 can not be executed, the broker refuses the connection or the
                 command fails without reporting any deleted pilot.

        Pilots that the command reports as not retrievable are set to "Deleted"
        in the pilotDB as a side effect.
    """

    if gridType == 'LCG':
      cmd = [ 'edg-job-status' ]
    elif gridType == 'gLite':
      cmd = [ 'glite-wms-job-status' ]
    else:
      return S_ERROR( 'Unknown GRID type %s' % gridType )
    cmd.extend( pilotRefList )

    start = time.time()
    ret = executeGridCommand( proxy, cmd, self.gridEnv )
    self.log.info( '%s Job Status Execution Time for %d jobs:' %
                   ( gridType, len( pilotRefList ) ), time.time() - start )

    if not ret['OK']:
      self.log.error( 'Failed to execute %s Job Status' % gridType, ret['Message'] )
      return S_ERROR( 'Failed to execute %s Job Status' % gridType )

    if ret['Value'][0] != 0:
      # Non zero exit code: look for pilots that the WMS does not know any more
      stderr = ret['Value'][2]
      stdout = ret['Value'][1]
      status = 'Deleted'
      statusDate = Time.dateTime()
      isFinal = status in self.finalStateList

      def deletedPilotDict():
        # A fresh dictionary ( and fresh containers ) per pilot, so that entries
        # of resultDict never alias each other. The keys mirror the ones of the
        # dictionaries produced when the status output is parsed.
        return { 'Status': status,
                 'DestinationSite': 'Unknown',
                 'StatusDate': statusDate,
                 'isChild': False,
                 'isParent': False,
                 'ParentRef': False,
                 'FinalStatus' : isFinal,
                 'ChildRefs' : [],
                 'ChildDicts' : {} }

      resultDict = {}
      # Glite returns this error for Deleted jobs to std.err
      for job in List.fromChar( stderr, '\nUnable to retrieve the status for:' )[1:]:
        pRef = List.fromChar( job, '\n' )[0].strip()
        resultDict[pRef] = deletedPilotDict()
        self.pilotDB.setPilotStatus( pRef, "Deleted" )
      # EDG returns a similar error for Deleted jobs to std.out
      for job in List.fromChar( stdout, '\nUnable to retrieve the status for:' )[1:]:
        pRef = List.fromChar( job, '\n' )[0].strip()
        if re.search( "No such file or directory: no matching jobs found", job ):
          resultDict[pRef] = deletedPilotDict()
          self.pilotDB.setPilotStatus( pRef, "Deleted" )
        # NOTE: the unescaped "()" is an empty regex group, so the literal
        # parentheses are not required for this pattern to match
        if re.search( "edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect()", job ):
          # the Broker is not accessible
          return S_ERROR( 'Broker not Available' )
      if not resultDict:
        # The command failed and no deleted pilot explains the failure
        self.log.error( 'Error executing %s Job Status:' %
                        gridType, str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
        return S_ERROR( 'Error executing %s Job Status' % gridType )
      return S_OK( resultDict )

    # Successful execution: one section of std.out per pilot reference
    stdout = ret['Value'][1]
    resultDict = {}
    for job in List.fromChar( stdout, '\nStatus info for the Job :' )[1:]:
      pRef = List.fromChar( job, '\n' )[0].strip()
      resultDict[pRef] = self.__parseJobStatus( job, gridType )

    return S_OK( resultDict )
Example #21
0
  def _prepareJDL( self, taskQueueDict, workingDirectory, pilotOptions,
                   pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ ):
    """
      Write JDL for Pilot Submission
    """
    rbList = []
    # Select Randomly one RB from the list
    rb = List.randomize( self.resourceBrokers )[0]
    rbList.append( '"https://%s:7443/glite_wms_wmproxy_server"' % rb )

    lbList = []
    for lb in self.loggingServers:
      lbList.append( '"https://%s:9000"' % lb )
    lbList = List.randomize( lbList )

    nPilots = 1
    vo = gConfig.getValue( '/DIRAC/VirtualOrganization', '' )
    if privateTQ or vo not in ['lhcb']:
      extraReq = "True"
    else:
      if submitPrivatePilot:
        extraReq = "! AllowsGenericPilot"
      else:
        extraReq = "AllowsGenericPilot"

    wmsClientJDL = """

RetryCount = 0;
ShallowRetryCount = 0;
MyProxyServer = "%s";

AllowsGenericPilot = Member( "VO-lhcb-pilot" , other.GlueHostApplicationSoftwareRunTimeEnvironment );
Requirements = pilotRequirements && %s;
WmsClient = [
ErrorStorage = "%s/pilotError";
OutputStorage = "%s/pilotOutput";
# ListenerPort = 44000;
ListenerStorage = "%s/Storage";
RetryCount = 0;
ShallowRetryCount = 0;
WMProxyEndPoints = { %s };
LBEndPoints = { %s };
MyProxyServer = "%s";
EnableServiceDiscovery = false;
JdlDefaultAttributes =  [
    requirements  =  ( other.GlueCEStateStatus == "Production" || other.GlueCEStateStatus == "Special" );
    AllowZippedISB  =  true;
    SignificantAttributes  =  {"Requirements", "Rank", "FuzzyRank"};
    PerusalFileEnable  =  false;
    ];
];
""" % ( self.myProxyServer, extraReq,
        workingDirectory, workingDirectory,
        workingDirectory, ', '.join( rbList ),
        ', '.join( lbList ), self.myProxyServer )

    if pilotsToSubmit > 1:
      wmsClientJDL += """
JobType = "Parametric";
Parameters= %s;
ParameterStep =1;
ParameterStart = 0;
""" % pilotsToSubmit
      nPilots = pilotsToSubmit


    ( pilotJDL , pilotRequirements ) = self._JobJDL( taskQueueDict, pilotOptions, ceMask )

    jdl = os.path.join( workingDirectory, '%s.jdl' % taskQueueDict['TaskQueueID'] )
    jdl = self._writeJDL( jdl, [pilotJDL, wmsClientJDL] )

    return {'JDL':jdl, 'Requirements':pilotRequirements + " && " + extraReq, 'Pilots':nPilots, 'RB':rb }
Example #22
0
            'JobState': jobState,
            'ConfigPath': self.ex_getProperty("section")
        }
        try:
            modInstance = self.__voPlugins[voPlugin](argsDict)
            result = modInstance.execute()
        except Exception, excp:
            self.jobLog.exception("Excp while executing %s" % voPlugin)
            return S_ERROR("Could not execute VO plugin %s: %s" %
                           (voPlugin, excp))

        if not result['OK']:
            return result
        extraPath = result['Value']
        if type(extraPath) in types.StringTypes:
            extraPath = List.fromChar(result['Value'])
        return S_OK(extraPath)

    def optimizeJob(self, jid, jobState):
        """Select the optimizer chain for a job from its manifest.

        The manifest is retrieved from jobState; a failure to get it is
        returned unchanged to the caller. When the manifest defines a
        non-empty "JobPath" option, that chain is applied through
        __setOptimizerChain and its result is returned.

        NOTE(review): the visible body ends right after the 'BasePath' and
        'VOPlugin' options are read, without using them - the remainder of
        the method looks truncated in this copy; confirm against the
        complete source.
        """
        result = jobState.getManifest()
        if not result['OK']:
            return result
        jobManifest = result['Value']
        # A chain defined by the job itself takes precedence
        opChain = jobManifest.getOption("JobPath", [])
        if opChain:
            self.jobLog.info('Job defines its own optimizer chain %s' %
                             opChain)
            return self.__setOptimizerChain(jobState, opChain)
        # Construct the path from the configured options
        # ( 'BasePath' defaults to ['JobPath', 'JobSanity'], 'VOPlugin' to '' )
        opPath = self.ex_getOption('BasePath', ['JobPath', 'JobSanity'])
        voPlugin = self.ex_getOption('VOPlugin', '')
Example #23
0
    def checkJob(self, job, classAdJob):
        """Work out the optimizer path of a job and pass it to processJob.

        The JDL of the job is loaded and synchronised first; a job whose
        description can not be loaded is set to failed. The path is then
        taken, in order of preference, from:

          - the 'JobPath' expression of the JDL, if the job defines one
          - the base path, extended by the VO plugin when one is configured,
            or by the input data optimizers when the job has input data,
            and closed by the end path
        """
        description = JobDescription()
        loaded = description.loadDescription(classAdJob.asJDL())
        if not loaded['OK']:
            self.setFailedJob(job, loaded['Message'], classAdJob)
            return loaded
        self.__syncJobDesc(job, description, classAdJob)

        # The job may bring its own path
        # FIXME: only some group might be able to overwrite the jobPath
        rawPath = classAdJob.get_expression('JobPath')
        userPath = rawPath.replace('"', '').replace('Unknown', '')
        if userPath:
            # HACK: drop the list delimiters to be left with a simple string
            for brace in ("{", "}"):
                userPath = userPath.replace(brace, "")
            self.log.info('Job %s defines its own optimizer chain %s' %
                          (job, userPath))
            return self.processJob(job, List.fromChar(userPath))

        # Otherwise build the path from the JDL and the VO plugin, if any
        optimizers = list(self.basePath)
        if not self.voPlugin:
            self.log.verbose('No VO specific plugin module specified')
            # Without VO plugin the input data setting is the only criterion
            inputResult = self.jobDB.getInputData(job)
            if not inputResult['OK']:
                self.log.error('Failed to get input data from JobDB', job)
                self.log.warn(inputResult['Message'])
                return inputResult

            # A non empty value means that there is input data
            if inputResult['Value']:
                self.log.info('Job %s has an input data requirement' % (job))
                optimizers.extend(self.inputData)
            else:
                self.log.info('Job %s has no input data requirement' % (job))
        else:
            pluginArgs = {
                'JobID': job,
                'ClassAd': classAdJob,
                'ConfigPath': self.am_getModuleParam("section")
            }
            factoryResult = ModuleFactory().getModule(self.voPlugin,
                                                      pluginArgs)
            if not factoryResult['OK']:
                self.log.error('Could not instantiate module:',
                               '%s' % (self.voPlugin))
                failure = 'Could not instantiate module: %s' % (self.voPlugin)
                self.setFailedJob(job, failure, classAdJob)
                return S_ERROR('Holding pending jobs')

            pluginResult = factoryResult['Value'].execute()
            if not pluginResult['OK']:
                self.log.warn('Execution of %s failed' % (self.voPlugin))
                return pluginResult
            extraPath = List.fromChar(pluginResult['Value'])
            if extraPath:
                optimizers.extend(extraPath)
                self.log.verbose(
                    'Adding extra VO specific optimizers to path: %s' %
                    (extraPath))

        optimizers.extend(self.endPath)
        self.log.info('Constructed path for job %s is: %s' %
                      (job, optimizers))
        return self.processJob(job, optimizers)