def __getJobSiteRequirement(self, job, classAdJob):
    """Collect the site constraints that can affect the scheduling of a job.

    The 'Site' attribute is read from the job DB and 'BannedSites' from the
    job ClassAd.

    :param job: job identifier, passed to the job DB and used in log messages
    :param classAdJob: ClassAd of the job
    :return: S_OK structure carrying the extra keys 'Sites' and 'BannedSites'
    """
    lookup = self.jobDB.getJobAttribute(job, 'Site')
    # A failed DB lookup is handled like a job without any site attribute
    chosen = List.fromChar(lookup['Value']) if lookup['OK'] else []

    result = S_OK()

    banned = classAdJob.getAttributeString('BannedSites')
    for brace in ('{', '}'):
        banned = banned.replace(brace, '')
    banned = List.fromChar(banned)

    if 'Multiple' in chosen:
        # 'Multiple' defers to the Site expression of the ClassAd
        result['Sites'] = classAdJob.getListFromExpression('Site')
    elif 'ANY' in chosen or 'Unknown' in chosen:
        result['Sites'] = []
    else:
        if len(chosen) == 1:
            self.log.info('Job %s has single chosen site %s specified in JDL' % (job, chosen[0]))
        result['Sites'] = chosen

    if banned:
        self.log.info('Job %s has JDL requirement to ban %s' % (job, banned))
        result['BannedSites'] = banned
    else:
        result['BannedSites'] = []
    return result
def checkJob(self, job, classAdJob):
    """Work out the optimizer path of a job and hand it to processJob.

    A 'JobPath' defined by the job itself takes precedence. Otherwise the
    path starts from self.basePath, is extended either by the VO plugin or,
    when no plugin is configured, by self.inputData if the job has input
    data, and is always closed with self.endPath.

    :param job: job identifier
    :param classAdJob: ClassAd of the job
    :return: the result of processJob, or the failing result / S_ERROR of the
             step that went wrong
    """
    description = JobDescription()
    loaded = description.loadDescription(classAdJob.asJDL())
    if not loaded['OK']:
        self.setFailedJob(job, loaded['Message'], classAdJob)
        return loaded
    self.__syncJobDesc(job, description, classAdJob)

    # Check if job defines a path itself
    # FIXME: only some group might be able to overwrite the jobPath
    ownPath = classAdJob.get_expression('JobPath')
    for token in ('"', 'Unknown'):
        ownPath = ownPath.replace(token, '')
    if ownPath:
        # HACK: Remove the { and } to ensure we have a simple string
        for brace in ("{", "}"):
            ownPath = ownPath.replace(brace, "")
        self.log.info('Job %s defines its own optimizer chain %s' % (job, ownPath))
        return self.processJob(job, List.fromChar(ownPath))

    # If no path, construct based on JDL and VO path module if present
    path = list(self.basePath)
    if not self.voPlugin:
        self.log.verbose('No VO specific plugin module specified')
        # Should only rely on an input data setting in absence of VO plugin
        inputData = self.jobDB.getInputData(job)
        if not inputData['OK']:
            self.log.error('Failed to get input data from JobDB', job)
            self.log.warn(inputData['Message'])
            return inputData
        # a non-empty returned value evaluates true
        if inputData['Value']:
            self.log.info('Job %s has an input data requirement' % (job))
            path.extend(self.inputData)
        else:
            self.log.info('Job %s has no input data requirement' % (job))
    else:
        pluginArgs = {'JobID': job,
                      'ClassAd': classAdJob,
                      'ConfigPath': self.am_getModuleParam("section")}
        loadedModule = ModuleFactory().getModule(self.voPlugin, pluginArgs)
        if not loadedModule['OK']:
            self.log.error('Could not instantiate module:', '%s' % (self.voPlugin))
            self.setFailedJob(job, 'Could not instantiate module: %s' % (self.voPlugin), classAdJob)
            return S_ERROR('Holding pending jobs')

        executed = loadedModule['Value'].execute()
        if not executed['OK']:
            self.log.warn('Execution of %s failed' % (self.voPlugin))
            return executed
        extraPath = List.fromChar(executed['Value'])
        if extraPath:
            path.extend(extraPath)
            self.log.verbose('Adding extra VO specific optimizers to path: %s' % (extraPath))

    path.extend(self.endPath)
    self.log.info('Constructed path for job %s is: %s' % (job, path))
    return self.processJob(job, path)
def __submitPilots(self, taskQueueDict, pilotsToSubmit):
    """Queue the pilot submission in the thread pool of one submit pool.

    The candidate pools ('SubmitPools' of the TaskQueue, otherwise the
    'DefaultSubmitPools' option) are tried in random order. A pool that
    refuses the job is disabled until the next iteration; the first pool
    that accepts it ends the search.

    :param taskQueueDict: description of the TaskQueue to submit pilots for
    :param pilotsToSubmit: number of pilots, forwarded to the director
    :return: S_OK( pilotsToSubmit ), whether or not a pool accepted the job
    """
    # Check if a specific MiddleWare is required
    if 'SubmitPools' in taskQueueDict:
        candidatePools = taskQueueDict['SubmitPools']
    else:
        candidatePools = self.am_getOption('DefaultSubmitPools')

    for poolName in List.randomize(candidatePools):
        self.log.verbose('Trying SubmitPool:', poolName)
        if not (poolName in self.directors and self.directors[poolName]['isEnabled']):
            self.log.verbose('Not Enabled')
            continue

        threadPool = self.pools[self.directors[poolName]['pool']]
        director = self.directors[poolName]['director']
        queued = threadPool.generateJobAndQueueIt(director.submitPilots,
                                                  args=(taskQueueDict, pilotsToSubmit, self.workDir),
                                                  oCallback=self.callBack,
                                                  oExceptionCallback=director.exceptionCallBack,
                                                  blocking=False)
        if queued['OK']:
            time.sleep(self.am_getOption('ThreadStartDelay'))
            break
        # Disable submission until next iteration
        self.directors[poolName]['isEnabled'] = False

    return S_OK(pilotsToSubmit)
# _prepareJDL -- write the JDL file used to submit pilots for one TaskQueue.
#
# One resource broker is picked at random from self.resourceBrokers; it feeds
# the LoggingDestination (port 9002) and LBAddresses (port 9000) entries, while
# the NSAddresses (port 7772) entries are built from self.loggingServers.
# NOTE(review): NS addresses come from the logging servers and LB addresses
# from the resource broker -- confirm this pairing is intended.
# extraReq is "True" for a private TaskQueue or when getVO() is not 'lhcb';
# otherwise it requires ("AllowsGenericPilot") or rejects ("! AllowsGenericPilot")
# generic pilots depending on submitPrivatePilot.
# The broker section is combined with the pilot JDL returned by self._JobJDL
# and written through self._writeJDL to <workingDirectory>/<TaskQueueID>.jdl.
# Returns a dict with the keys 'JDL', 'Requirements', 'Pilots' and 'RB'.
# NOTE(review): List.randomize(...)[0] presumably fails on an empty broker
# list -- confirm callers guarantee at least one resource broker.
def _prepareJDL( self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ ): """ Write JDL for Pilot Submission """ # RB = List.randomize( self.resourceBrokers )[0] LDs = [] NSs = [] LBs = [] # Select Randomly one RB from the list RB = List.randomize( self.resourceBrokers )[0] LDs.append( '"%s:9002"' % RB ) LBs.append( '"%s:9000"' % RB ) for LB in self.loggingServers: NSs.append( '"%s:7772"' % LB ) LD = ', '.join( LDs ) NS = ', '.join( NSs ) LB = ', '.join( LBs ) vo = getVO() if privateTQ or vo not in ['lhcb']: extraReq = "True" else: if submitPrivatePilot: extraReq = "! AllowsGenericPilot" else: extraReq = "AllowsGenericPilot" rbJDL = """ AllowsGenericPilot = Member( "VO-lhcb-pilot" , other.GlueHostApplicationSoftwareRunTimeEnvironment ); Requirements = pilotRequirements && other.GlueCEStateStatus == "Production" && %s; RetryCount = 0; ErrorStorage = "%s/pilotError"; OutputStorage = "%s/pilotOutput"; # ListenerPort = 44000; ListenerStorage = "%s/Storage"; VirtualOrganisation = "lhcb"; LoggingTimeout = 30; LoggingSyncTimeout = 30; LoggingDestination = { %s }; # Default NS logger level is set to 0 (null) # max value is 6 (very ugly) NSLoggerLevel = 0; DefaultLogInfoLevel = 0; DefaultStatusLevel = 0; NSAddresses = { %s }; LBAddresses = { %s }; MyProxyServer = "no-myproxy.cern.ch"; """ % ( extraReq, workingDirectory, workingDirectory, workingDirectory, LD, NS, LB ) pilotJDL, pilotRequirements = self._JobJDL( taskQueueDict, pilotOptions, ceMask ) jdl = os.path.join( workingDirectory, '%s.jdl' % taskQueueDict['TaskQueueID'] ) jdl = self._writeJDL( jdl, [pilotJDL, rbJDL] ) return {'JDL':jdl, 'Requirements':pilotRequirements + " && " + extraReq, 'Pilots': pilotsToSubmit, 'RB':RB }
def __getJobSiteRequirement(self, job, classAdJob):
    """Collect the site constraints that can affect the scheduling of a job.

    The 'Site' attribute is read from the job DB. Banned sites come from the
    'BannedSite' ClassAd attribute, falling back to the legacy 'BannedSites'
    spelling when the former is empty.

    :param job: job identifier, passed to the job DB and used in log messages
    :param classAdJob: ClassAd of the job
    :return: S_OK structure carrying the extra keys 'Sites' and 'BannedSites'
    """
    lookup = self.jobDB.getJobAttribute(job, 'Site')
    # A failed DB lookup is handled like a job without any site attribute
    chosen = List.fromChar(lookup['Value']) if lookup['OK'] else []

    result = S_OK()

    banned = classAdJob.getAttributeString('BannedSite')
    if not banned:
        # Just try out the legacy option variant
        banned = classAdJob.getAttributeString('BannedSites')
    for brace in ('{', '}'):
        banned = banned.replace(brace, '')
    banned = List.fromChar(banned)

    hasGroup = any("Group" in entry for entry in chosen)

    if 'Multiple' in chosen or hasGroup:
        sites = classAdJob.getListFromExpression('Site')
        # We might also be here after a Staging Request where several Sites are allowed
        if 'ANY' in sites or '' in sites:
            sites = []
    elif 'ANY' in chosen or 'Unknown' in chosen:
        sites = []
    else:
        if len(chosen) == 1:
            self.log.info('Job %s has single chosen site %s specified in JDL' % (job, chosen[0]))
        sites = chosen
    result['Sites'] = sites

    if banned:
        self.log.info('Job %s has JDL requirement to ban %s' % (job, banned))
        result['BannedSites'] = banned
    else:
        result['BannedSites'] = []
    return result
def __getJobSiteRequirement(self, job, classAdJob):
    """Return the candidate and banned sites relevant for scheduling a job.

    :param job: job identifier, passed to the job DB and used in log messages
    :param classAdJob: ClassAd of the job; provides 'BannedSites' and, for
                       'Multiple' or group sites, the 'Site' expression
    :return: S_OK structure carrying the extra keys 'Sites' and 'BannedSites'
    """
    siteAttribute = self.jobDB.getJobAttribute(job, "Site")
    siteList = []
    if siteAttribute["OK"]:
        siteList = List.fromChar(siteAttribute["Value"])

    result = S_OK()

    rawBanned = classAdJob.getAttributeString("BannedSites")
    bannedList = List.fromChar(rawBanned.replace("{", "").replace("}", ""))

    isGroup = False
    for name in siteList:
        isGroup = isGroup or "Group" in name

    needsExpression = "Multiple" in siteList or isGroup
    isWildcard = "ANY" in siteList or "Unknown" in siteList

    if not (isWildcard or needsExpression):
        if len(siteList) == 1:
            self.log.info("Job %s has single chosen site %s specified in JDL" % (job, siteList[0]))
        result["Sites"] = siteList
    elif needsExpression:
        result["Sites"] = classAdJob.getListFromExpression("Site")
        # We might also be here after a Staging Request where several Sites are allowed
        if "ANY" in result["Sites"] or "" in result["Sites"]:
            result["Sites"] = []
    else:
        result["Sites"] = []

    result["BannedSites"] = []
    if bannedList:
        self.log.info("Job %s has JDL requirement to ban %s" % (job, bannedList))
        result["BannedSites"] = bannedList
    return result
def __executeVOPlugin(self, voPlugin, jobState):
    """Import the VO plugin on first use and cache its class in self.__voPlugins.

    The plugin class is expected to carry the name of the last component of
    the dotted module path.

    NOTE(review): in the visible source the body ends once the plugin class
    is cached, so the success path returns None and jobState is unused --
    confirm whether the remainder of the method was lost.

    :param voPlugin: dotted module path of the plugin
    :param jobState: state object of the job (not used in the visible part)
    :return: S_ERROR if the module cannot be imported or lacks the class
    """
    if voPlugin not in self.__voPlugins:
        modName = List.fromChar(voPlugin, ".")[-1]
        try:
            # A non-empty fromlist makes __import__ return the leaf module
            module = __import__(voPlugin, globals(), locals(), [modName])
        except ImportError as excp:
            self.jobLog.exception("Could not import VO plugin %s" % voPlugin)
            return S_ERROR("Could not import VO plugin %s: %s" % (voPlugin, excp))

        try:
            self.__voPlugins[voPlugin] = getattr(module, modName)
        except AttributeError as excp:
            return S_ERROR("Could not get plugin %s from module %s: %s" % (modName, voPlugin, str(excp)))
def parseJobSubmitStdout(self, proxy, cmd, taskQueueID, rb):
    """Run the Job Submit command and extract the pilot reference from its stdout.

    :param proxy: proxy handed to executeGridCommand
    :param cmd: submit command to execute
    :param taskQueueID: TaskQueue identifier, used in log messages
    :param rb: resource broker named in the error mail when the command fails
    :return: False if the command fails or prints no reference; otherwise the
             tuple ( reference, broker ) where reference is the last 'https:'
             reference found in stdout and broker is the part between
             'https://' and the last ':' of the first reference that has
             such a part ('' if none has)
    """
    start = time.time()
    self.log.verbose('Executing Job Submit for TaskQueue', taskQueueID)

    ret = executeGridCommand(proxy, cmd, self.gridEnv)
    if not ret['OK']:
        self.log.error('Failed to execute Job Submit:', ret['Message'])
        self.__sendErrorMail(rb, 'Job Submit', cmd, ret, proxy)
        return False
    if ret['Value'][0] != 0:
        self.log.error('Error executing Job Submit:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        self.__sendErrorMail(rb, 'Job Submit', cmd, ret, proxy)
        return False
    self.log.info('Job Submit Execution Time: %.2f for TaskQueue %d' % ((time.time() - start), taskQueueID))

    stdout = ret['Value'][1]

    glite_id = None
    # From here on rb is the broker parsed from the output, not the argument
    rb = ''
    for line in List.fromChar(stdout, '\n'):
        refMatch = re.search(r"(https:\S+)", line)
        if not refMatch:
            continue
        glite_id = refMatch.group(1)
        if not rb:
            # A reference without a ':port' part must not abort the parsing
            hostMatch = re.search(r"https://(.+):.+", glite_id)
            if hostMatch:
                rb = hostMatch.group(1)

    if glite_id is None:
        self.log.error('Job Submit returns no Reference:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        return False

    self.log.info('Reference %s for TaskQueue %s' % (glite_id, taskQueueID))
    return glite_id, rb
def parseJobSubmitStdout(self, proxy, cmd, taskQueueID, rb):
    """Execute the Job Submit command and parse the pilot reference it prints.

    :param proxy: proxy handed to executeGridCommand
    :param cmd: submit command to execute
    :param taskQueueID: TaskQueue identifier, used in log messages
    :param rb: resource broker named in the error mail when the command fails
    :return: False if the command fails or prints no reference; otherwise
             ( reference, broker ): the last "https:" reference in stdout and
             the text between "https://" and the last ":" of the first
             reference containing one ("" if there is none)
    """
    start = time.time()
    self.log.verbose("Executing Job Submit for TaskQueue", taskQueueID)
    ret = executeGridCommand(proxy, cmd, self.gridEnv)

    if not ret["OK"]:
        self.log.error("Failed to execute Job Submit:", ret["Message"])
        self.__sendErrorMail(rb, "Job Submit", cmd, ret, proxy)
        return False

    if ret["Value"][0] != 0:
        self.log.error("Error executing Job Submit:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
        self.__sendErrorMail(rb, "Job Submit", cmd, ret, proxy)
        return False

    self.log.info("Job Submit Execution Time: %.2f for TaskQueue %d" % ((time.time() - start), taskQueueID))

    referenceRE = re.compile(r"(https:\S+)")
    brokerRE = re.compile(r"https://(.+):.+")

    glite_id = None
    # The rb argument is only needed for the error mails above
    rb = ""
    for line in List.fromChar(ret["Value"][1], "\n"):
        found = referenceRE.search(line)
        if found:
            glite_id = found.group(1)
            if not rb:
                broker = brokerRE.search(glite_id)
                # Guard: a reference without ":port" gives no match object
                if broker:
                    rb = broker.group(1)

    if glite_id is None:
        self.log.error("Job Submit returns no Reference:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
        return False

    self.log.info("Reference %s for TaskQueue %s" % (glite_id, taskQueueID))
    return glite_id, rb
def configure(self, csSection, submitPool):
    """Apply the configuration common to all Grid PilotDirectors.

    After the base configuration is loaded, expired entries are purged from
    the failing-WMS and tickets caches, every broker still present in the
    failing-WMS cache is dropped from self.resourceBrokers, and the
    remaining brokers are shuffled.

    :param csSection: configuration section, forwarded unchanged
    :param submitPool: submit pool name, forwarded unchanged
    """
    PilotDirector.configure(self, csSection, submitPool)
    self.reloadConfiguration(csSection, submitPool)

    self.__failingWMSCache.purgeExpired()
    self.__ticketsWMSCache.purgeExpired()
    for rb in self.__failingWMSCache.getKeys():
        try:
            self.resourceBrokers.remove(rb)
        except ValueError:
            # The failing broker is not in the list: nothing to remove
            pass

    self.resourceBrokers = List.randomize(self.resourceBrokers)

    if self.gridEnv:
        self.log.info(' GridEnv: ', self.gridEnv)
    if self.resourceBrokers:
        self.log.info(' ResourceBrokers:', ', '.join(self.resourceBrokers))
def parseListMatchStdout(self, proxy, cmd, taskQueueID, rb):
    """Run the List Match command and return the stdout lines naming matched CEs.

    :param proxy: proxy handed to executeGridCommand
    :param cmd: list-match command to execute
    :param taskQueueID: TaskQueue identifier, used in log messages
    :param rb: resource broker named in the error mail when the command fails
    :return: list of stdout lines containing '/jobmanager-' or '/cream-'
             (possibly empty), or False if the command could not be executed
             or exited with a non-zero code
    """
    self.log.verbose('Executing List Match for TaskQueue', taskQueueID)
    began = time.time()
    ret = executeGridCommand(proxy, cmd, self.gridEnv)

    if not ret['OK']:
        self.log.error('Failed to execute List Match:', ret['Message'])
        self.__sendErrorMail(rb, 'List Match', cmd, ret, proxy)
        return False
    if ret['Value'][0] != 0:
        self.log.error('Error executing List Match:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        self.__sendErrorMail(rb, 'List Match', cmd, ret, proxy)
        return False
    self.log.info('List Match Execution Time: %.2f for TaskQueue %d' % ((time.time() - began), taskQueueID))

    stdout, stderr = ret['Value'][1], ret['Value'][2]

    # Parse std.out
    # TODO: the line has to be stripped from extra info
    availableCEs = [line for line in List.fromChar(stdout, '\n')
                    if '/jobmanager-' in line or '/cream-' in line]

    if availableCEs:
        self.log.debug('List-Match returns:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        self.log.info('List-Match found %s CEs for TaskQueue' % len(availableCEs), taskQueueID)
        self.log.verbose(', '.join(availableCEs))
    else:
        self.log.info('List-Match failed to find CEs for TaskQueue', taskQueueID)
        self.log.info(stdout)
        self.log.info(stderr)

    return availableCEs
def _getChildrenReferences(self, proxy, parentReference, taskQueueID):
    """Query the status of a parent job and collect the references of its children.

    :param proxy: proxy handed to executeGridCommand
    :param parentReference: reference of the parent job
    :param taskQueueID: TaskQueue identifier, used in log messages
    :return: [] if the status command fails; [ parentReference ] if the
             output names no child; otherwise the distinct child references
             in order of appearance
    """
    statusCmd = ["glite-wms-job-status", parentReference]
    began = time.time()
    self.log.verbose("Executing Job Status for TaskQueue", taskQueueID)
    ret = executeGridCommand(proxy, statusCmd, self.gridEnv)

    if not ret["OK"]:
        self.log.error("Failed to execute Job Status", ret["Message"])
        return []
    if ret["Value"][0] != 0:
        self.log.error("Error executing Job Status:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
        return []
    self.log.info("Job Status Execution Time: %.2f" % (time.time() - began))

    children = []
    for line in List.fromChar(ret["Value"][1], "\n"):
        found = re.search("Status info for the Job : (https:\S+)", line)
        if not found:
            continue
        reference = found.group(1)
        # The parent reports itself as well; keep every child only once
        if reference != parentReference and reference not in children:
            children.append(reference)

    if children:
        return children

    error = str(ret["Value"][0]) + "\n".join(ret["Value"][1:3])
    self.log.error("Job Status returns no Child Reference:", error)
    return [parentReference]
def parseListMatchStdout(self, proxy, cmd, taskQueueID, rb):
    """Execute the List Match command and pick the matched CEs out of its stdout.

    :param proxy: proxy handed to executeGridCommand
    :param cmd: list-match command to execute
    :param taskQueueID: TaskQueue identifier, used in log messages
    :param rb: resource broker named in the error mail when the command fails
    :return: False when the command cannot be executed or exits non-zero,
             otherwise the (possibly empty) list of stdout lines containing
             "/jobmanager-" or "/cream-"
    """
    self.log.verbose("Executing List Match for TaskQueue", taskQueueID)
    startTime = time.time()
    ret = executeGridCommand(proxy, cmd, self.gridEnv)

    if not ret["OK"]:
        self.log.error("Failed to execute List Match:", ret["Message"])
        self.__sendErrorMail(rb, "List Match", cmd, ret, proxy)
        return False

    if ret["Value"][0] != 0:
        self.log.error("Error executing List Match:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
        self.__sendErrorMail(rb, "List Match", cmd, ret, proxy)
        return False

    elapsed = time.time() - startTime
    self.log.info("List Match Execution Time: %.2f for TaskQueue %d" % (elapsed, taskQueueID))

    output = ret["Value"][1]
    errors = ret["Value"][2]

    # Parse std.out
    matchedCEs = []
    for candidate in List.fromChar(output, "\n"):
        isJobManager = re.search("/jobmanager-", candidate)
        if isJobManager or re.search("/cream-", candidate):
            # TODO: the line has to be stripped from extra info
            matchedCEs.append(candidate)

    if matchedCEs:
        self.log.debug("List-Match returns:", str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
        self.log.info("List-Match found %s CEs for TaskQueue" % len(matchedCEs), taskQueueID)
        self.log.verbose(", ".join(matchedCEs))
        return matchedCEs

    self.log.info("List-Match failed to find CEs for TaskQueue", taskQueueID)
    self.log.info(output)
    self.log.info(errors)
    return matchedCEs
def _getChildrenReferences(self, proxy, parentReference, taskQueueID):
    """Query the status of a parent job and collect the references of its children.

    A list is returned on every path: a failed status command yields an
    empty list rather than False, so callers can iterate over the result
    while a plain truth test still detects the failure.

    :param proxy: proxy handed to executeGridCommand
    :param parentReference: reference of the parent job
    :param taskQueueID: TaskQueue identifier, used in log messages
    :return: [] if the status command fails; [ parentReference ] if the
             output names no child; otherwise the distinct child references
             in order of appearance
    """
    cmd = ['glite-wms-job-status', parentReference]

    start = time.time()
    self.log.verbose('Executing Job Status for TaskQueue', taskQueueID)

    ret = executeGridCommand(proxy, cmd, self.gridEnv)
    if not ret['OK']:
        self.log.error('Failed to execute Job Status', ret['Message'])
        return []
    if ret['Value'][0] != 0:
        self.log.error('Error executing Job Status:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        return []
    self.log.info('Job Status Execution Time: %.2f' % (time.time() - start))

    stdout = ret['Value'][1]

    references = []
    for line in List.fromChar(stdout, '\n'):
        match = re.search(r"Status info for the Job : (https:\S+)", line)
        if not match:
            continue
        glite_id = match.group(1)
        # The parent reports itself as well; keep every child only once
        if glite_id not in references and glite_id != parentReference:
            references.append(glite_id)

    if not references:
        error = str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3])
        self.log.error('Job Status returns no Child Reference:', error)
        return [parentReference]

    return references
def __submitPilots(self, taskQueueDict, pilotsToSubmit):
    """Try to insert the submission in the Thread Pool of one of the submit pools.

    Pools are visited in random order; a pool whose Thread Pool rejects the
    job is disabled until the next iteration, and the loop stops at the
    first pool that accepts it.

    :param taskQueueDict: TaskQueue description; its 'SubmitPools' entry, if
                          present, overrides the 'DefaultSubmitPools' option
    :param pilotsToSubmit: number of pilots, forwarded to the director
    :return: S_OK(pilotsToSubmit) in every case
    """
    # Check if a specific MiddleWare is required
    submitPools = taskQueueDict['SubmitPools'] if 'SubmitPools' in taskQueueDict \
        else self.am_getOption('DefaultSubmitPools')
    submitPools = List.randomize(submitPools)

    for name in submitPools:
        self.log.verbose('Trying SubmitPool:', name)

        usable = name in self.directors and self.directors[name]['isEnabled']
        if not usable:
            self.log.verbose('Not Enabled')
            continue

        director = self.directors[name]['director']
        submission = self.pools[self.directors[name]['pool']].generateJobAndQueueIt(
            director.submitPilots,
            args=(taskQueueDict, pilotsToSubmit, self.workDir),
            oCallback=self.callBack,
            oExceptionCallback=director.exceptionCallBack,
            blocking=False)

        if not submission['OK']:
            # Disable submission until next iteration
            self.directors[name]['isEnabled'] = False
            continue

        time.sleep(self.am_getOption('ThreadStartDelay'))
        break

    return S_OK(pilotsToSubmit)
argsDict = { 'JobID': jobState.jid, 'JobState' : jobState, 'ConfigPath':self.ex_getProperty( "section" ) } try: modInstance = self.__voPlugins[ voPlugin ]( argsDict ) result = modInstance.execute() except Exception, excp: self.jobLog.exception( "Excp while executing %s" % voPlugin ) return S_ERROR( "Could not execute VO plugin %s: %s" % ( voPlugin, excp ) ) if not result['OK']: return result extraPath = result[ 'Value' ] if type( extraPath ) in types.StringTypes: extraPath = List.fromChar( result['Value'] ) return S_OK( extraPath ) def optimizeJob( self, jid, jobState ): result = jobState.getManifest() if not result[ 'OK' ]: return result jobManifest = result[ 'Value' ] opChain = jobManifest.getOption( "JobPath", [] ) if opChain: self.jobLog.info( 'Job defines its own optimizer chain %s' % opChain ) return self.__setOptimizerChain( jobState, opChain ) #Construct path opPath = self.ex_getOption( 'BasePath', ['JobPath', 'JobSanity'] ) voPlugin = self.ex_getOption( 'VOPlugin', '' )
# _prepareJDL -- write the JDL file used to submit pilots for one TaskQueue.
#
# One resource broker is picked at random from self.resourceBrokers and becomes
# the single WMProxyEndPoints entry (port 7443); LBEndPoints (port 9000) are
# built from self.loggingServers and shuffled.
# extraReq is "True" for a private TaskQueue or when the configured
# '/DIRAC/VirtualOrganization' is not 'lhcb'; otherwise it requires
# ("AllowsGenericPilot") or rejects ("! AllowsGenericPilot") generic pilots
# depending on submitPrivatePilot.
# When pilotsToSubmit > 1 a Parametric job section is appended and 'Pilots' in
# the returned dict is pilotsToSubmit, otherwise it is 1.
# The WMS client section is combined with the pilot JDL returned by
# self._JobJDL and written through self._writeJDL to
# <workingDirectory>/<TaskQueueID>.jdl.
# Returns a dict with the keys 'JDL', 'Requirements', 'Pilots' and 'RB'.
# NOTE(review): List.randomize(...)[0] presumably fails on an empty broker
# list -- confirm callers guarantee at least one resource broker.
def _prepareJDL(self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ): """ Write JDL for Pilot Submission """ rbList = [] # Select Randomly one RB from the list rb = List.randomize(self.resourceBrokers)[0] rbList.append('"https://%s:7443/glite_wms_wmproxy_server"' % rb) lbList = [] for lb in self.loggingServers: lbList.append('"https://%s:9000"' % lb) lbList = List.randomize(lbList) nPilots = 1 vo = gConfig.getValue('/DIRAC/VirtualOrganization', '') if privateTQ or vo not in ['lhcb']: extraReq = "True" else: if submitPrivatePilot: extraReq = "! AllowsGenericPilot" else: extraReq = "AllowsGenericPilot" wmsClientJDL = """ RetryCount = 0; ShallowRetryCount = 0; MyProxyServer = "%s"; AllowsGenericPilot = Member( "VO-lhcb-pilot" , other.GlueHostApplicationSoftwareRunTimeEnvironment ); Requirements = pilotRequirements && %s; WmsClient = [ ErrorStorage = "%s/pilotError"; OutputStorage = "%s/pilotOutput"; # ListenerPort = 44000; ListenerStorage = "%s/Storage"; RetryCount = 0; ShallowRetryCount = 0; WMProxyEndPoints = { %s }; LBEndPoints = { %s }; MyProxyServer = "%s"; EnableServiceDiscovery = false; JdlDefaultAttributes = [ requirements = ( other.GlueCEStateStatus == "Production" || other.GlueCEStateStatus == "Special" ); AllowZippedISB = true; SignificantAttributes = {"Requirements", "Rank", "FuzzyRank"}; PerusalFileEnable = false; ]; ]; """ % (self.myProxyServer, extraReq, workingDirectory, workingDirectory, workingDirectory, ', '.join(rbList), ', '.join(lbList), self.myProxyServer) if pilotsToSubmit > 1: wmsClientJDL += """ JobType = "Parametric"; Parameters= %s; ParameterStep =1; ParameterStart = 0; """ % pilotsToSubmit nPilots = pilotsToSubmit (pilotJDL, pilotRequirements) = self._JobJDL(taskQueueDict, pilotOptions, ceMask) jdl = os.path.join(workingDirectory, '%s.jdl' % taskQueueDict['TaskQueueID']) jdl = self._writeJDL(jdl, [pilotJDL, wmsClientJDL]) return { 'JDL': jdl, 'Requirements': pilotRequirements + " && " 
+ extraReq, 'Pilots': nPilots, 'RB': rb }
# _prepareJDL -- write the JDL file used to submit pilots for one TaskQueue.
#
# One resource broker is picked at random from self.resourceBrokers; it feeds
# the LoggingDestination (port 9002) and LBAddresses (port 9000) entries, while
# the NSAddresses (port 7772) entries are built from self.loggingServers.
# NOTE(review): NS addresses come from the logging servers and LB addresses
# from the resource broker -- confirm this pairing is intended.
# extraReq is "True" for a private TaskQueue or when getVO() is not 'lhcb';
# otherwise it requires ("AllowsGenericPilot") or rejects ("! AllowsGenericPilot")
# generic pilots depending on submitPrivatePilot.
# The broker section is combined with the pilot JDL returned by self._JobJDL
# and written through self._writeJDL to <workingDirectory>/<TaskQueueID>.jdl.
# Returns a dict with the keys 'JDL', 'Requirements', 'Pilots' and 'RB'.
# NOTE(review): List.randomize(...)[0] presumably fails on an empty broker
# list -- confirm callers guarantee at least one resource broker.
def _prepareJDL(self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ): """ Write JDL for Pilot Submission """ # RB = List.randomize( self.resourceBrokers )[0] LDs = [] NSs = [] LBs = [] # Select Randomly one RB from the list RB = List.randomize(self.resourceBrokers)[0] LDs.append('"%s:9002"' % RB) LBs.append('"%s:9000"' % RB) for LB in self.loggingServers: NSs.append('"%s:7772"' % LB) LD = ', '.join(LDs) NS = ', '.join(NSs) LB = ', '.join(LBs) vo = getVO() if privateTQ or vo not in ['lhcb']: extraReq = "True" else: if submitPrivatePilot: extraReq = "! AllowsGenericPilot" else: extraReq = "AllowsGenericPilot" rbJDL = """ AllowsGenericPilot = Member( "VO-lhcb-pilot" , other.GlueHostApplicationSoftwareRunTimeEnvironment ); Requirements = pilotRequirements && other.GlueCEStateStatus == "Production" && %s; RetryCount = 0; ErrorStorage = "%s/pilotError"; OutputStorage = "%s/pilotOutput"; # ListenerPort = 44000; ListenerStorage = "%s/Storage"; VirtualOrganisation = "lhcb"; LoggingTimeout = 30; LoggingSyncTimeout = 30; LoggingDestination = { %s }; # Default NS logger level is set to 0 (null) # max value is 6 (very ugly) NSLoggerLevel = 0; DefaultLogInfoLevel = 0; DefaultStatusLevel = 0; NSAddresses = { %s }; LBAddresses = { %s }; MyProxyServer = "no-myproxy.cern.ch"; """ % (extraReq, workingDirectory, workingDirectory, workingDirectory, LD, NS, LB) pilotJDL, pilotRequirements = self._JobJDL(taskQueueDict, pilotOptions, ceMask) jdl = os.path.join(workingDirectory, '%s.jdl' % taskQueueDict['TaskQueueID']) jdl = self._writeJDL(jdl, [pilotJDL, rbJDL]) return { 'JDL': jdl, 'Requirements': pilotRequirements + " && " + extraReq, 'Pilots': pilotsToSubmit, 'RB': RB }
def __parseJobStatus(self, job, gridType):
    """Parse the output of the grid pilot status command for one job.

    Parsing errors are logged and leave the affected fields at their
    defaults (status None, destination 'Unknown', dates None). A pilot
    reported as Running is turned into Deleted when its status date is older
    than 4 days or, lacking a status date, when it was submitted more than
    7 days ago. For a parent job every child section is parsed recursively.

    :param job: status text of a single job
    :param gridType: 'LCG' (status date is parsed) or 'gLite' (submission
                     date is parsed)
    :return: dict with the keys 'Status', 'DestinationSite', 'StatusDate',
             'isChild', 'isParent', 'ParentRef', 'FinalStatus', 'ChildRefs'
             and 'ChildDicts'
    """
    statusRE = r'Current Status:\s*(\w*)'
    destinationRE = r'Destination:\s*([\w\.-]*)'
    statusDateLCGRE = r'reached on:\s*....(.*)'
    submittedDateRE = r'Submitted:\s*....(.*)'
    statusFailedRE = r'Current Status:.*\(Failed\)'

    status = None
    destination = 'Unknown'
    statusDate = None
    submittedDate = None
    try:
        # A missing status line raises AttributeError and is logged below
        status = re.search(statusRE, job).group(1)
        if status == 'Done' and re.search(statusFailedRE, job):
            status = 'Failed'

        found = re.search(destinationRE, job)
        if found:
            destination = found.group(1)

        if gridType == 'LCG':
            found = re.search(statusDateLCGRE, job)
            if found:
                statusDate = found.group(1)
                statusDate = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.strptime(statusDate, '%b %d %H:%M:%S %Y'))
        if gridType == 'gLite':
            found = re.search(submittedDateRE, job)
            if found:
                submittedDate = found.group(1)
                submittedDate = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.strptime(submittedDate, '%b %d %H:%M:%S %Y %Z'))
    except Exception:
        # Best effort: keep whatever was parsed so far, but do not swallow
        # SystemExit / KeyboardInterrupt
        self.log.exception('Error parsing %s Job Status output:\n' % gridType, job)

    isParent = bool(re.search('Nodes information', job))
    isChild = bool(re.search('Parent Job', job))

    if status == "Running":
        # Pilots can be in Running state for too long, due to bugs in the WMS
        if statusDate:
            statusTime = Time.fromString(statusDate)
            delta = Time.dateTime() - statusTime
            if delta > 4 * Time.day:
                self.log.info('Setting pilot status to Deleted after 4 days in Running')
                status = "Deleted"
                statusDate = statusTime + 4 * Time.day
        elif submittedDate:
            statusTime = Time.fromString(submittedDate)
            delta = Time.dateTime() - statusTime
            if delta > 7 * Time.day:
                self.log.info('Setting pilot status to Deleted more than 7 days after submission still in Running')
                status = "Deleted"
                statusDate = statusTime + 7 * Time.day

    childRefs = []
    childDicts = {}
    if isParent:
        for subjob in List.fromChar(job, ' Status info for the Job :')[1:]:
            chRef = List.fromChar(subjob, '\n')[0].strip()
            childDicts[chRef] = self.__parseJobStatus(subjob, gridType)
            childRefs.append(chRef)

    return {'Status': status,
            'DestinationSite': destination,
            'StatusDate': statusDate,
            'isChild': isChild,
            'isParent': isParent,
            'ParentRef': False,
            'FinalStatus': status in self.finalStateList,
            'ChildRefs': childRefs,
            'ChildDicts': childDicts}
def getPilotStatus(self, proxy, gridType, pilotRefList):
    """Get the GRID status of a list of pilots using the given proxy.

    :param proxy: proxy handed to executeGridCommand
    :param gridType: 'LCG' or 'gLite'; any other value gives S_ERROR
    :param pilotRefList: pilot references appended to the status command
    :return: S_OK( dict ) mapping each pilot reference found in the command
             output to its status dictionary, or S_ERROR. When the command
             exits non-zero, only the pilots it reports as not retrievable
             are returned (and set to "Deleted" in the pilot DB).
    """
    if gridType == 'LCG':
        cmd = ['edg-job-status']
    elif gridType == 'gLite':
        cmd = ['glite-wms-job-status']
    else:
        return S_ERROR()
    cmd.extend(pilotRefList)

    began = time.time()
    ret = executeGridCommand(proxy, cmd, self.gridEnv)
    self.log.info('%s Job Status Execution Time for %d jobs:' % (gridType, len(pilotRefList)),
                  time.time() - began)

    if not ret['OK']:
        self.log.error('Failed to execute %s Job Status' % gridType, ret['Message'])
        return S_ERROR()

    exitCode = ret['Value'][0]
    stdout = ret['Value'][1]
    stderr = ret['Value'][2]
    resultDict = {}

    if exitCode == 0:
        for job in List.fromChar(stdout, '\nStatus info for the Job :')[1:]:
            pRef = List.fromChar(job, '\n')[0].strip()
            resultDict[pRef] = self.__parseJobStatus(job, gridType)
        return S_OK(resultDict)

    # Non-zero exit code: look for pilots the WMS no longer knows about
    unknownMarker = '\nUnable to retrieve the status for:'
    deletedStatus = 'Deleted'
    # The same dictionary object is stored for every deleted pilot
    deletedJobDict = {'Status': deletedStatus,
                      'DestinationSite': 'Unknown',
                      'StatusDate': Time.dateTime(),
                      'isChild': False,
                      'isParent': False,
                      'ParentRef': False,
                      'FinalStatus': deletedStatus in self.finalStateList,
                      'ChildRefs': []}
    deleted = 0

    # Glite returns this error for Deleted jobs to std.err
    for job in List.fromChar(stderr, unknownMarker)[1:]:
        pRef = List.fromChar(job, '\n')[0].strip()
        resultDict[pRef] = deletedJobDict
        self.pilotDB.setPilotStatus(pRef, "Deleted")
        deleted += 1

    # EDG returns a similar error for Deleted jobs to std.out
    for job in List.fromChar(stdout, unknownMarker)[1:]:
        pRef = List.fromChar(job, '\n')[0].strip()
        if re.search("No such file or directory: no matching jobs found", job):
            resultDict[pRef] = deletedJobDict
            self.pilotDB.setPilotStatus(pRef, "Deleted")
            deleted += 1
        if re.search("edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect()", job):
            # the Broker is not accessible
            return S_ERROR('Broker not Available')

    if deleted:
        return S_OK(resultDict)

    self.log.error('Error executing %s Job Status:' % gridType,
                   str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
    return S_ERROR()
# _prepareJDL -- write the JDL file used to submit pilots for one TaskQueue.
#
# One resource broker is picked at random from self.resourceBrokers and becomes
# the single WMProxyEndPoints entry (port 7443); LBEndPoints (port 9000) are
# built from self.loggingServers and shuffled.
# extraReq is "True" for a private TaskQueue or when the configured
# '/DIRAC/VirtualOrganization' is not 'lhcb'; otherwise it requires
# ("AllowsGenericPilot") or rejects ("! AllowsGenericPilot") generic pilots
# depending on submitPrivatePilot.
# When pilotsToSubmit > 1 a Parametric job section is appended and 'Pilots' in
# the returned dict is pilotsToSubmit, otherwise it is 1.
# The WMS client section is combined with the pilot JDL returned by
# self._JobJDL and written through self._writeJDL to
# <workingDirectory>/<TaskQueueID>.jdl.
# Returns a dict with the keys 'JDL', 'Requirements', 'Pilots' and 'RB'.
# NOTE(review): List.randomize(...)[0] presumably fails on an empty broker
# list -- confirm callers guarantee at least one resource broker.
def _prepareJDL( self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ ): """ Write JDL for Pilot Submission """ rbList = [] # Select Randomly one RB from the list rb = List.randomize( self.resourceBrokers )[0] rbList.append( '"https://%s:7443/glite_wms_wmproxy_server"' % rb ) lbList = [] for lb in self.loggingServers: lbList.append( '"https://%s:9000"' % lb ) lbList = List.randomize( lbList ) nPilots = 1 vo = gConfig.getValue( '/DIRAC/VirtualOrganization', '' ) if privateTQ or vo not in ['lhcb']: extraReq = "True" else: if submitPrivatePilot: extraReq = "! AllowsGenericPilot" else: extraReq = "AllowsGenericPilot" wmsClientJDL = """ RetryCount = 0; ShallowRetryCount = 0; MyProxyServer = "%s"; AllowsGenericPilot = Member( "VO-lhcb-pilot" , other.GlueHostApplicationSoftwareRunTimeEnvironment ); Requirements = pilotRequirements && %s; WmsClient = [ ErrorStorage = "%s/pilotError"; OutputStorage = "%s/pilotOutput"; # ListenerPort = 44000; ListenerStorage = "%s/Storage"; RetryCount = 0; ShallowRetryCount = 0; WMProxyEndPoints = { %s }; LBEndPoints = { %s }; MyProxyServer = "%s"; EnableServiceDiscovery = false; JdlDefaultAttributes = [ requirements = ( other.GlueCEStateStatus == "Production" || other.GlueCEStateStatus == "Special" ); AllowZippedISB = true; SignificantAttributes = {"Requirements", "Rank", "FuzzyRank"}; PerusalFileEnable = false; ]; ]; """ % ( self.myProxyServer, extraReq, workingDirectory, workingDirectory, workingDirectory, ', '.join( rbList ), ', '.join( lbList ), self.myProxyServer ) if pilotsToSubmit > 1: wmsClientJDL += """ JobType = "Parametric"; Parameters= %s; ParameterStep =1; ParameterStart = 0; """ % pilotsToSubmit nPilots = pilotsToSubmit ( pilotJDL , pilotRequirements ) = self._JobJDL( taskQueueDict, pilotOptions, ceMask ) jdl = os.path.join( workingDirectory, '%s.jdl' % taskQueueDict['TaskQueueID'] ) jdl = self._writeJDL( jdl, [pilotJDL, wmsClientJDL] ) return {'JDL':jdl, 
'Requirements':pilotRequirements + " && " + extraReq, 'Pilots':nPilots, 'RB':rb }
'JobState': jobState, 'ConfigPath': self.ex_getProperty("section") } try: modInstance = self.__voPlugins[voPlugin](argsDict) result = modInstance.execute() except Exception, excp: self.jobLog.exception("Excp while executing %s" % voPlugin) return S_ERROR("Could not execute VO plugin %s: %s" % (voPlugin, excp)) if not result['OK']: return result extraPath = result['Value'] if type(extraPath) in types.StringTypes: extraPath = List.fromChar(result['Value']) return S_OK(extraPath) def optimizeJob(self, jid, jobState): result = jobState.getManifest() if not result['OK']: return result jobManifest = result['Value'] opChain = jobManifest.getOption("JobPath", []) if opChain: self.jobLog.info('Job defines its own optimizer chain %s' % opChain) return self.__setOptimizerChain(jobState, opChain) #Construct path opPath = self.ex_getOption('BasePath', ['JobPath', 'JobSanity']) voPlugin = self.ex_getOption('VOPlugin', '')
def checkJob(self, job, classAdJob):
    """Control the checking of a job: build its optimizer path and process it.

    The job's own 'JobPath', if any, is used as is. Otherwise the path is
    self.basePath, plus the optimizers returned by the VO plugin (if one is
    configured) or self.inputData (no plugin and the job has input data),
    plus self.endPath.

    :param job: job identifier
    :param classAdJob: ClassAd of the job
    :return: whatever processJob returns, or the error of the failing step
    """
    jobDesc = JobDescription()
    loadResult = jobDesc.loadDescription(classAdJob.asJDL())
    if not loadResult['OK']:
        self.setFailedJob(job, loadResult['Message'], classAdJob)
        return loadResult
    self.__syncJobDesc(job, jobDesc, classAdJob)

    # Check if job defines a path itself
    # FIXME: only some group might be able to overwrite the jobPath
    rawPath = classAdJob.get_expression('JobPath')
    definedPath = rawPath.replace('"', '').replace('Unknown', '')
    if definedPath:
        # HACK: Remove the { and } to ensure we have a simple string
        definedPath = definedPath.replace("{", "").replace("}", "")
        self.log.info('Job %s defines its own optimizer chain %s' % (job, definedPath))
        return self.processJob(job, List.fromChar(definedPath))

    # If no path, construct based on JDL and VO path module if present
    optimizers = list(self.basePath)

    if self.voPlugin:
        pluginArguments = {
            'JobID': job,
            'ClassAd': classAdJob,
            'ConfigPath': self.am_getModuleParam("section"),
        }
        factory = ModuleFactory()
        instance = factory.getModule(self.voPlugin, pluginArguments)
        if not instance['OK']:
            self.log.error('Could not instantiate module:', '%s' % (self.voPlugin))
            self.setFailedJob(job, 'Could not instantiate module: %s' % (self.voPlugin), classAdJob)
            return S_ERROR('Holding pending jobs')

        pluginResult = instance['Value'].execute()
        if not pluginResult['OK']:
            self.log.warn('Execution of %s failed' % (self.voPlugin))
            return pluginResult

        pluginOptimizers = List.fromChar(pluginResult['Value'])
        if pluginOptimizers:
            optimizers.extend(pluginOptimizers)
            self.log.verbose('Adding extra VO specific optimizers to path: %s' % (pluginOptimizers))
    else:
        self.log.verbose('No VO specific plugin module specified')
        # Should only rely on an input data setting in absence of VO plugin
        dataResult = self.jobDB.getInputData(job)
        if not dataResult['OK']:
            self.log.error('Failed to get input data from JobDB', job)
            self.log.warn(dataResult['Message'])
            return dataResult

        # a non-empty returned value evaluates true
        hasInputData = bool(dataResult['Value'])
        if hasInputData:
            self.log.info('Job %s has an input data requirement' % (job))
            optimizers.extend(self.inputData)
        else:
            self.log.info('Job %s has no input data requirement' % (job))

    optimizers.extend(self.endPath)
    self.log.info('Constructed path for job %s is: %s' % (job, optimizers))
    return self.processJob(job, optimizers)