def test_CreateAndSubmit(self):
    """Create a job wrapper for a sample job and submit it through an 'InProcess' CE.

    Each step (CE creation, wrapper creation, submission) must return a
    result whose 'OK' field is true.
    """
    jobParams = {
        'JobID': '1',
        'JobType': 'Merge',
        'CPUTime': '1000000',
        'Executable': '$DIRACROOT/scripts/dirac-jobexec',
        'Arguments': "helloWorld.xml -o LogLevel=DEBUG pilot.cfg",
        'ExtraOptions': 'pilot.cfg',
        'InputSandbox': ['helloWorld.xml', 'exe-script.py'],
    }
    resourceParams = {}
    optimizerParams = {}

    ceFactory = ComputingElementFactory()
    ceInstance = ceFactory.getCE('InProcess')
    # assertTrue replaces the deprecated assert_ alias; the 'Message' field
    # of the result (if any) is reported when the assertion fails.
    self.assertTrue(ceInstance['OK'], ceInstance.get('Message'))
    computingElement = ceInstance['Value']

    res = createJobWrapper(2, jobParams, resourceParams, optimizerParams,
                           extraOptions='pilot.cfg', logLevel='DEBUG')
    self.assertTrue(res['OK'], res.get('Message'))
    wrapperFile = res['Value']

    res = computingElement.submitJob(wrapperFile, self.payloadProxy)
    self.assertTrue(res['OK'], res.get('Message'))
def _setUpworkloadCE(self):
    """Get the application queue, build its ComputingElement and attach a proxy to it.

    The site, CE and queue names are read from the ``/LocalSite/Site``,
    ``/LocalSite/GridCE`` and ``/LocalSite/CEQueue`` configuration options.

    :return: S_OK(ComputingElement instance), or the error structure of the
             first step that failed
    """
    # Function-scope import so the block does not rely on S_ERROR being
    # imported at module level.
    from DIRAC import S_ERROR

    # Get CE parameters
    workloadSite = gConfig.getValue("/LocalSite/Site")
    workloadCEName = gConfig.getValue("/LocalSite/GridCE")
    workloadQueue = gConfig.getValue("/LocalSite/CEQueue")

    result = getQueue(workloadSite, workloadCEName, workloadQueue)
    if not result["OK"]:
        return result
    ceParams = result["Value"]
    ceType = ceParams["CEType"]

    # Build CE
    ceFactory = ComputingElementFactory()
    result = ceFactory.getCE(ceName=workloadCEName, ceType=ceType, ceParametersDict=ceParams)
    if not result["OK"]:
        return result
    workloadCE = result["Value"]

    # Add a proxy to the CE.
    # The two failure modes are checked separately: "Value" must not be
    # indexed on a failed result, and a missing chain must not reach
    # getRemainingSecs() below.
    result = getProxyInfo()
    if not result["OK"]:
        return result
    proxy = result["Value"].get("chain")
    if not proxy:
        return S_ERROR("No proxy chain found in the proxy information")

    result = proxy.getRemainingSecs()
    if not result["OK"]:
        return result
    lifetime_secs = result["Value"]
    workloadCE.setProxy(proxy, lifetime_secs)

    return S_OK(workloadCE)
def export_killPilot(self, pilotRefList):
    """ Kill the specified pilots

    :param pilotRefList: a single pilot reference (string) or an iterable of references
    :return: S_OK() when every kill request succeeded, S_ERROR or the failed
             intermediate result otherwise
    """
    # Accept a bare string as well as a collection of references
    if type(pilotRefList) in StringTypes:
        references = [pilotRefList]
    else:
        references = list(pilotRefList)

    # Group the pilots by owner, group, site, CE and queue
    groupedPilots = {}
    for reference in references:
        infoResult = pilotDB.getPilotInfo(reference)
        if not (infoResult['OK'] and infoResult['Value']):
            return S_ERROR('Failed to get info for pilot ' + reference)
        info = infoResult['Value'][reference]
        groupKey = '@@@'.join([info['OwnerDN'],
                               info['OwnerGroup'],
                               info['GridSite'],
                               info['DestinationSite'],
                               info['Queue']])
        pilotGridType = info['GridType']
        groupEntry = groupedPilots.setdefault(groupKey, {})
        groupEntry.setdefault('PilotList', []).append(reference)
        groupEntry['GridType'] = pilotGridType

    # Kill the pilots one queue at a time
    factory = ComputingElementFactory()
    notKilled = []
    for groupKey, groupEntry in groupedPilots.items():
        owner, group, site, ceName, queueName = groupKey.split('@@@')

        queueResult = getQueue(site, ceName, queueName)
        if not queueResult['OK']:
            return queueResult
        queueParameters = queueResult['Value']

        gridType = groupEntry['GridType']
        ceResult = factory.getCE(gridType, ceName, queueParameters)
        if not ceResult['OK']:
            return ceResult
        computingElement = ceResult['Value']

        # Only these grid types get the pilot owner's proxy attached
        if gridType in ["LCG", "gLite", "CREAM"]:
            group = getGroupOption(group, 'VOMSRole', group)
            proxyResult = gProxyManager.getPilotProxyFromVOMSGroup(owner, group)
            if not proxyResult['OK']:
                gLogger.error(proxyResult['Message'])
                gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (owner, group))
                return S_ERROR("Failed to get the pilot's owner proxy")
            computingElement.setProxy(proxyResult['Value'])

        toKill = groupEntry['PilotList']
        killResult = computingElement.killJob(toKill)
        if not killResult['OK']:
            notKilled.extend(toKill)

    if notKilled:
        return S_ERROR('Failed to kill at least some pilots')
    return S_OK()
def initialize(self, loops=0):
    """Set the agent defaults and create the Computing Element instance.

    :param loops: value stored in the 'MaxCycles' agent option
    :return: S_OK() on success, otherwise the failed result of the CE factory
    """
    # Disable monitoring
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    # The local site configuration, when set, overrides the agent option
    ceUniqueID = self.am_getOption('CEUniqueID', 'InProcess')
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
        self.log.info('Defining CE from local configuration = %s' % localCE)
        ceUniqueID = localCE

    factory = ComputingElementFactory()
    self.ceName = ceUniqueID
    ceResult = factory.getCE(ceUniqueID)
    if not ceResult['OK']:
        self.log.warn(ceResult['Message'])
        return ceResult
    self.computingElement = ceResult['Value']

    # Four directory levels above this file
    rootDir = __file__
    for _ in range(4):
        rootDir = os.path.dirname(rootDir)
    self.diracRoot = rootDir

    # Local site options
    self.siteRoot = gConfig.getValue('/LocalSite/Root', os.getcwd())
    self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', 'Unknown')
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', 86400 * 5)

    # Agent options
    # Factor to convert raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0)
    defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
    wrapperLocation = self.am_getOption('JobWrapperTemplate', defaultWrapperLocation)
    self.jobWrapperTemplate = os.path.join(self.diracRoot, wrapperLocation)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
    self.defaultLogLevel = self.am_getOption('DefaultLogLevel', 'info')
    self.fillingMode = self.am_getOption('FillingModeFlag', False)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure', True)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', 10)
    self.jobCount = 0
    self.matchFailedCount = 0

    # Time left
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
    self.gridCEQueue = gConfig.getValue('/Resources/Computing/CEDefaults/GridCEQueue', '')
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()
def initialize(self, loops=0):
    """Configure the agent and instantiate its backend Computing Element.

    The current value of each agent attribute is passed as the default when
    the corresponding configuration option is looked up.

    :param loops: value stored in the 'MaxCycles' agent option
    :return: S_OK() on success, otherwise the failed result of the CE
             factory or of the CE description query
    """
    # Disable monitoring
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    # The local site configuration, when set, overrides the agent option
    ceType = self.am_getOption('CEType', 'InProcess')
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
        self.log.info('Defining CE from local configuration = %s' % localCE)
        ceType = localCE

    # Create backend Computing Element
    factory = ComputingElementFactory()
    self.ceName = ceType
    ceResult = factory.getCE(ceType)
    if not ceResult['OK']:
        self.log.warn(ceResult['Message'])
        return ceResult
    self.computingElement = ceResult['Value']

    descriptionResult = self.computingElement.getDescription()
    if not descriptionResult['OK']:
        self.log.warn("Can not get the CE description")
        return descriptionResult
    ceDescription = descriptionResult['Value']
    # CE description first, then the configuration may override it
    cpuTime = ceDescription.get('CPUTime', self.timeLeft)
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', cpuTime)

    self.initTimes = os.times()

    # Local site options, then the CPU normalization factor
    # (raw CPU to normalized units, based on the CPU model)
    for attribute, optionPath in (('siteName', '/LocalSite/Site'),
                                  ('pilotReference', '/LocalSite/PilotReference'),
                                  ('defaultProxyLength', '/Registry/DefaultProxyLifeTime'),
                                  ('cpuFactor', '/LocalSite/CPUNormalizationFactor')):
        setattr(self, attribute, gConfig.getValue(optionPath, getattr(self, attribute)))

    # Agent options
    for attribute, optionName in (('jobSubmissionDelay', 'SubmissionDelay'),
                                  ('fillingMode', 'FillingModeFlag'),
                                  ('minimumTimeLeft', 'MinimumTimeLeft'),
                                  ('stopOnApplicationFailure', 'StopOnApplicationFailure'),
                                  ('stopAfterFailedMatches', 'StopAfterFailedMatches')):
        setattr(self, attribute, self.am_getOption(optionName, getattr(self, attribute)))

    self.extraOptions = gConfig.getValue('/AgentJobRequirements/ExtraOptions', self.extraOptions)

    # Time left
    self.timeLeftUtil = TimeLeft()
    return S_OK()
def getQueues(self, resourceDict):
    """ Get the list of relevant CEs and their descriptions

    Fills self.queueDict with one entry per "<ce>_<queue>" found in
    resourceDict (site -> CE -> CE description holding a 'Queues' dict).
    The 'Queues' key is popped from each CE description and the queue
    parameter dictionaries are completed in place.

    :return: S_OK(), or the failed result of the CE creation / validation
    """
    self.queueDict = {}
    factory = ComputingElementFactory()

    for site in resourceDict:
        siteCEs = resourceDict[site]
        for ce in siteCEs:
            ceDict = siteCEs[ce]
            queues = ceDict.pop('Queues')
            for queue in queues:
                queueName = '%s_%s' % (ce, queue)
                entry = {}
                self.queueDict[queueName] = entry
                params = queues[queue]
                entry['ParametersDict'] = params
                params['Queue'] = queue
                params['Site'] = site
                params['GridEnv'] = self.gridEnv
                params['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in params and "SI00" in params:
                    # For some sites there are crazy values in the CS: clamp them
                    maxCPUTime = min(max(float(params['maxCPUTime']), 0), 86400 * 12.5)
                    si00 = float(params['SI00'])
                    queueCPUTime = 60. / 250. * maxCPUTime * si00
                    params['CPUTime'] = int(queueCPUTime)

                # One working directory per queue name
                queueWorkDir = os.path.join(self.workingDirectory, queue)
                if not os.path.exists(queueWorkDir):
                    os.makedirs(queueWorkDir)
                params['WorkingDirectory'] = queueWorkDir

                # Queue parameters take precedence over the CE ones
                ceQueueDict = dict(ceDict)
                ceQueueDict.update(params)

                ceResult = factory.getCE(ceName=ce,
                                         ceType=ceDict['CEType'],
                                         ceParametersDict=ceQueueDict)
                if not ceResult['OK']:
                    return ceResult
                queueCE = ceResult['Value']
                entry['CE'] = queueCE
                entry['CEName'] = ce
                entry['CEType'] = ceDict['CEType']
                entry['Site'] = site
                entry['QueueName'] = queue

                validity = queueCE.isValid()
                if not validity['OK']:
                    self.log.fatal(validity['Message'])
                    return validity

                if 'BundleProxy' in params:
                    entry['BundleProxy'] = True

    return S_OK()
def __createCE(self, ceName):
    """ Ask the CE factory for a CE object for the given ceName.

    :return: the result structure of the factory, unchanged; a warning is
             logged when it is not OK
    """
    self.log.info("Creating %s CE" % (ceName))
    result = ComputingElementFactory().getCE(ceName)
    if not result['OK']:
        self.log.warn(result['Message'])
    return result
def initialize(self, loops=0):
    """Set the default parameters and create the CE instance.

    :param loops: value stored in the 'MaxCycles' agent option
    :return: S_OK() on success, otherwise the failed result of the CE
             factory or of the CE description query
    """
    # Disable monitoring
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    # The local site configuration, when set, overrides the agent option
    ceType = self.am_getOption('CEType', 'InProcess')
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
        self.log.info('Defining CE from local configuration = %s' % localCE)
        ceType = localCE

    # Create backend Computing Element
    self.ceName = ceType
    ceResult = ComputingElementFactory().getCE(ceType)
    if not ceResult['OK']:
        self.log.warn(ceResult['Message'])
        return ceResult
    self.computingElement = ceResult['Value']

    descriptionResult = self.computingElement.getDescription()
    if not descriptionResult['OK']:
        self.log.warn("Can not get the CE description")
        return descriptionResult
    # CE description first, then the configuration may override it
    cpuTime = descriptionResult['Value'].get('CPUTime', 0.0)
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', cpuTime)

    self.initTimes = os.times()

    # Local site options
    self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', 'Unknown')
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', 86400 * 5)

    # Agent options
    # Factor to convert raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
    self.fillingMode = self.am_getOption('FillingModeFlag', False)
    self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', 1000)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure', True)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', 10)
    self.jobCount = self.matchFailedCount = 0
    self.extraOptions = gConfig.getValue('/AgentJobRequirements/ExtraOptions', '')

    # Time left
    self.timeLeftUtil = TimeLeft()
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()
def initialize(self, loops=0):
    """Set the default parameters and create the CE instance.

    :param loops: value stored in the 'MaxCycles' agent option
    :return: S_OK() on success, otherwise the failed result of the CE factory
    """
    # Disable monitoring
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    # The local site configuration, when set, overrides the agent option
    ceType = self.am_getOption('CEType', 'InProcess')
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
        self.log.info('Defining CE from local configuration = %s' % localCE)
        ceType = localCE

    factory = ComputingElementFactory()
    self.ceName = ceType
    creation = factory.getCE(ceType)
    if not creation['OK']:
        self.log.warn(creation['Message'])
        return creation

    self.initTimes = os.times()
    self.computingElement = creation['Value']

    # Local site options
    self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', 'Unknown')
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', 86400 * 5)

    # Agent options
    # Factor to convert raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
    self.fillingMode = self.am_getOption('FillingModeFlag', False)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure', True)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', 10)
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = gConfig.getValue('/AgentJobRequirements/ExtraOptions', '')

    # Time left
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()
def initialize(self, loops=0):
    """Set the default parameters and create the CE instance.

    :param loops: value stored in the 'MaxCycles' agent option
    :return: S_OK() on success, otherwise the failed result of the CE factory
    """
    # Disable monitoring
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    # The local site configuration, when set, overrides the agent option
    ceType = self.am_getOption('CEType', 'InProcess')
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
        self.log.info('Defining CE from local configuration = %s' % localCE)
        ceType = localCE

    factory = ComputingElementFactory()
    self.ceName = ceType
    creation = factory.getCE(ceType)
    if not creation['OK']:
        self.log.warn(creation['Message'])
        return creation

    self.initTimes = os.times()
    self.computingElement = creation['Value']

    # Four directory levels above this file
    here = os.path.dirname(__file__)
    self.diracRoot = os.path.dirname(os.path.dirname(os.path.dirname(here)))

    # Local site options
    currentDir = os.getcwd()
    self.siteRoot = gConfig.getValue('/LocalSite/Root', currentDir)
    self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', 'Unknown')
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', 86400 * 5)

    # Agent options
    # Factor to convert raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0)
    defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
    templatePath = self.am_getOption('JobWrapperTemplate', defaultWrapperLocation)
    self.jobWrapperTemplate = os.path.join(self.diracRoot, templatePath)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
    self.defaultLogLevel = self.am_getOption('DefaultLogLevel', 'info')
    self.fillingMode = self.am_getOption('FillingModeFlag', False)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure', True)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', 10)
    self.jobCount = 0
    self.matchFailedCount = 0

    # Time left
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
    self.gridCEQueue = gConfig.getValue('/Resources/Computing/CEDefaults/GridCEQueue', '')
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()
def addComputingElement(self, ceList):
    """ Make sure a CE object is registered for every CE name in ceList.

    CEs already present in self.computingElementDict are left untouched.
    Processing stops (returning None) at the first CE that cannot be created.
    """
    for ceName in ceList:
        if ceName in self.computingElementDict:
            continue

        creation = ComputingElementFactory().getCE(ceName=ceName)
        if not creation['OK']:
            self.log.error('Can not create CE object:', creation['Message'])
            return

        ceObject = creation['Value']
        # The registry entry is the CE's own configuration dictionary
        ceEntry = ceObject.ceConfigDict
        self.computingElementDict[ceName] = ceEntry
        # add the 'CE' instance at the end to avoid being overwritten
        ceEntry['CE'] = ceObject
def _initializeComputingElement(self, localCE):
    """Create the ComputingElement named by localCE and configure it.

    :param str localCE: CE name, optionally followed by "/<inner submission type>"
                        (it might be "Pool/Singularity", or simply "Pool")
    :return: S_OK() on success, otherwise the failed result of the CE factory
    """
    factory = ComputingElementFactory()

    parts = localCE.split("/")
    self.ceName = parts[0]
    # The inner submission type is only overridden for the exact "<name>/<type>" form
    if len(parts) == 2:
        submissionType = parts[1]
    else:
        submissionType = self.innerCESubmissionType
    self.innerCESubmissionType = submissionType

    creation = factory.getCE(self.ceName)
    if not creation["OK"]:
        self.log.warn("Can't instantiate a CE", creation["Message"])
        return creation

    self.computingElement = creation["Value"]
    self.computingElement.setParameters({"InnerCESubmissionType": self.innerCESubmissionType})
    return S_OK()
def getCREAMPilotOutput(proxy, pilotRef, pilotStamp):
    """ Retrieve the standard output and error of a pilot through a CREAM CE object.

    A throw-away CE is created with a temporary working directory, which is
    removed again before returning, also when one of the CE calls raises.

    :param proxy: proxy handed to the CE with setProxy()
    :param pilotRef: pilot reference
    :param pilotStamp: pilot stamp, joined to the reference with ":::"
    :return: the failed CE factory result, S_ERROR if the output cannot be
             retrieved, otherwise S_OK() extended with the 'FileList',
             'StdOut' and 'StdErr' keys
    """
    gridEnv = getGridEnv()
    tmpdir = mkdtemp()
    try:
        # NOTE(review): 'Qeuue' looks like a typo for a dummy queue name; it
        # is kept unchanged because it is a runtime value - confirm it is unused.
        result = ComputingElementFactory().getCE(ceName='CREAMSite',
                                                 ceType='CREAM',
                                                 ceParametersDict={'GridEnv': gridEnv,
                                                                   'Queue': 'Qeuue',
                                                                   'OutputURL': "gsiftp://localhost",
                                                                   'WorkingDirectory': tmpdir})
        if not result['OK']:
            return result
        ce = result['Value']
        ce.reset()
        ce.setProxy(proxy)
        fullPilotRef = ":::".join([pilotRef, pilotStamp])
        result = ce.getJobOutput(fullPilotRef)
    finally:
        # Best-effort cleanup so that a failure here never hides the real error
        shutil.rmtree(tmpdir, ignore_errors=True)

    if not result['OK']:
        return S_ERROR('Failed to get pilot output: %s' % result['Message'])

    output, error = result['Value']
    # NOTE(review): outputSandboxFiles is not defined in this function;
    # presumably a module-level name - verify it exists.
    fileList = outputSandboxFiles
    result = S_OK()
    result['FileList'] = fileList
    result['StdOut'] = output
    result['StdErr'] = error
    return result
def getPilotCE(pilotDict):
    """Instantiate and return a CE bound to a pilot

    :param dict pilotDict: pilot description; the "GridSite", "DestinationSite",
                           "Queue" and "GridType" keys are read
    :return: S_OK(CE object), or the failed result of the queue lookup or of
             the CE factory
    """
    factory = ComputingElementFactory()

    queueResult = getQueue(pilotDict["GridSite"], pilotDict["DestinationSite"], pilotDict["Queue"])
    if not queueResult["OK"]:
        return queueResult

    queueParameters = queueResult["Value"]
    queueParameters["GridEnv"] = getGridEnv()
    workDir = mkdtemp()
    queueParameters["WorkingDirectory"] = workDir

    ceResult = factory.getCE(pilotDict["GridType"], pilotDict["DestinationSite"], queueParameters)
    if ceResult["OK"]:
        return S_OK(ceResult["Value"])

    # The temporary working directory is only removed when the CE could not be created
    shutil.rmtree(queueParameters["WorkingDirectory"])
    return ceResult
def test_CreateAndSubmit(self):
    """Create a job wrapper for a sample job and submit it through an "InProcess" CE.

    The "pilot.cfg" extra options are only passed along when that file
    is present in the current directory.
    """
    jobParams = dict(
        [
            ("JobID", "1"),
            ("JobType", "Merge"),
            ("CPUTime", "1000000"),
            ("Executable", "dirac-jobexec"),
            ("Arguments", "helloWorld.xml -o LogLevel=DEBUG --cfg pilot.cfg"),
            ("InputSandbox", ["helloWorld.xml", "exe-script.py"]),
        ]
    )
    resourceParams = {}
    optimizerParams = {}

    ceResult = ComputingElementFactory().getCE("InProcess")
    self.assertTrue(ceResult["OK"])
    computingElement = ceResult["Value"]

    wrapperOptions = {"logLevel": "DEBUG"}
    if "pilot.cfg" in os.listdir("."):
        jobParams.setdefault("ExtraOptions", "pilot.cfg")
        wrapperOptions["extraOptions"] = "pilot.cfg"
    wrapperResult = createJobWrapper(2, jobParams, resourceParams, optimizerParams, **wrapperOptions)
    self.assertTrue(wrapperResult["OK"], wrapperResult.get("Message"))

    # The wrapper file is the first element of the returned value
    wrapperFile = wrapperResult["Value"][0]
    submission = computingElement.submitJob(wrapperFile, self.payloadProxy)
    self.assertTrue(submission["OK"], submission.get("Message"))
def killPilotsInQueues(pilotRefDict):
    """Kill pilots queue by queue

    :params dict pilotRefDict: a dict of pilots in queues; keys are
        "owner@@@group@@@site@@@ce@@@queue" strings, values hold the
        "GridType" and "PilotList" entries
    :return: the list of pilots whose kill request failed, or an error
        structure when a queue, CE or proxy could not be obtained
    """
    factory = ComputingElementFactory()
    notKilled = []

    for queueKey, queueInfo in pilotRefDict.items():
        owner, group, site, ceName, queueName = queueKey.split("@@@")

        queueResult = getQueue(site, ceName, queueName)
        if not queueResult["OK"]:
            return queueResult
        queueParameters = queueResult["Value"]

        gridType = queueInfo["GridType"]
        ceResult = factory.getCE(gridType, ceName, queueParameters)
        if not ceResult["OK"]:
            return ceResult
        computingElement = ceResult["Value"]

        # FIXME: quite hacky. Should be either removed, or based on some flag
        if gridType in ["CREAM", "ARC", "Globus", "HTCondorCE"]:
            group = getGroupOption(group, "VOMSRole", group)
            proxyResult = gProxyManager.getPilotProxyFromVOMSGroup(owner, group)
            if not proxyResult["OK"]:
                details = 'User "%s" Group "%s" : %s' % (owner, group, proxyResult["Message"])
                gLogger.error("Could not get proxy:", details)
                return S_ERROR("Failed to get the pilot's owner proxy")
            computingElement.setProxy(proxyResult["Value"])

        pilots = queueInfo["PilotList"]
        killResult = computingElement.killJob(pilots)
        if not killResult["OK"]:
            notKilled.extend(pilots)

    return notKilled
def test__getCEDict(mocker, ceType, expectedType, expectedNumberElement):
    """Test JobAgent()._getCEDict()"""
    mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.AgentModule.__init__")

    agent = JobAgent("Test", "Test1")
    agent.log = gLogger
    agent.log.setLevel("DEBUG")

    creation = ComputingElementFactory().getCE(ceType)
    assert creation["OK"]
    computingElement = creation["Value"]
    # Multi-processor settings of the CE under test
    computingElement.ceParameters["MultiProcessorStrategy"] = True
    computingElement.ceParameters["NumberOfProcessors"] = 4

    outcome = agent._getCEDict(computingElement)
    assert outcome["OK"]

    description = outcome["Value"]
    assert isinstance(description, expectedType)
    assert len(description) == expectedNumberElement
def test__checkCEAvailability(mocker, ceType, mockCEReply, expectedResult):
    """Test JobAgent()._checkAvailability()"""
    mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.AgentModule.__init__")
    # The CE reports whatever reply the test case supplies
    mocker.patch(
        "DIRAC.Resources.Computing.ComputingElement.ComputingElement.available",
        return_value=mockCEReply,
    )

    agent = JobAgent("Test", "Test1")
    agent.log = gLogger
    agent.log.setLevel("DEBUG")

    creation = ComputingElementFactory().getCE(ceType)
    assert creation["OK"]
    computingElement = creation["Value"]

    outcome = agent._checkCEAvailability(computingElement)
    assert outcome["OK"] == expectedResult["OK"]
    # Only the fields present in the expectation are compared
    for field in ("Value", "Message"):
        if field in expectedResult:
            assert outcome[field] == expectedResult[field]
def getQueues(self, resourceDict):
    """
    Get the list of relevant CEs and their descriptions

    Fills self.queueDict with one entry per "<ce>_<queue>" found in
    resourceDict (site -> CE -> CE description holding a "Queues" dict),
    and records the platforms and sites met in self.platforms / self.sites.
    The "Queues" key is popped from each CE description and the queue
    parameter dictionaries are completed in place.

    :return: S_OK(), or the failed result of the CE creation / validation
    """
    self.queueDict = {}
    factory = ComputingElementFactory()

    for site, siteCEs in resourceDict.items():
        for ce, ceDict in siteCEs.items():
            queues = ceDict.pop("Queues")
            for queue, params in queues.items():
                queueName = "%s_%s" % (ce, queue)
                entry = {"ParametersDict": params}
                self.queueDict[queueName] = entry
                params["Queue"] = queue
                params["Site"] = site
                params["GridEnv"] = self.gridEnv
                params["Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown")

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in params and "SI00" in params:
                    # For some sites there are crazy values in the CS: clamp them
                    maxCPUTime = min(max(float(params["maxCPUTime"]), 0), 86400 * 12.5)
                    si00 = float(params["SI00"])
                    queueCPUTime = 60.0 / 250.0 * maxCPUTime * si00
                    params["CPUTime"] = int(queueCPUTime)

                # One working directory per queue name
                queueWorkDir = os.path.join(self.workingDirectory, queue)
                if not os.path.exists(queueWorkDir):
                    os.makedirs(queueWorkDir)
                params["WorkingDirectory"] = queueWorkDir

                # Platform: the queue setting wins, then the CE setting, then architecture + OS
                if "Platform" in params:
                    platform = params["Platform"]
                elif "Platform" in ceDict:
                    platform = ceDict["Platform"]
                elif "OS" in ceDict:
                    platform = "_".join([ceDict.get("architecture", "x86_64"), ceDict["OS"]])
                else:
                    platform = ""

                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if platform and "Platform" not in params:
                    platformResult = Resources.getDIRACPlatform(platform)
                    if platformResult["OK"]:
                        params["Platform"] = platformResult["Value"]

                # Queue parameters take precedence over the CE ones
                ceQueueDict = dict(ceDict)
                ceQueueDict.update(params)

                ceType = ceDict["CEType"]
                ceResult = factory.getCE(ceName=ce, ceType=ceType, ceParametersDict=ceQueueDict)
                if not ceResult["OK"]:
                    return ceResult
                queueCE = ceResult["Value"]
                entry["CE"] = queueCE
                entry["CEName"] = ce
                entry["CEType"] = ceDict["CEType"]
                entry["Site"] = site
                entry["QueueName"] = queue

                validity = queueCE.isValid()
                if not validity["OK"]:
                    self.log.fatal(validity["Message"])
                    return validity

                # The mere presence of the option, at queue or CE level, enables it
                if "BundleProxy" in params or "BundleProxy" in ceDict:
                    entry["BundleProxy"] = True

                if site not in self.sites:
                    self.sites.append(site)

    return S_OK()
def getQueues(self, resourceDict):
    """ Get the list of relevant CEs and their descriptions

    Fills self.queueDict with one entry per "<ce>_<queue>" found in
    resourceDict (site -> CE -> CE description holding a 'Queues' dict),
    and records the platforms and sites met in self.platforms / self.sites.
    CE objects are reused from self.queueCECache while the hash of the
    merged CE/queue parameters is unchanged. The 'Queues' key is popped from
    each CE description and the queue parameter dictionaries are completed
    in place.

    :param dict resourceDict: site -> CE name -> CE description
    :return: S_OK(), or the failed result of the CE creation / validation
    """
    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
        for ce in resourceDict[site]:
            ceDict = resourceDict[site][ce]
            ceTags = ceDict.get('Tag')
            if isinstance(ceTags, basestring):
                ceTags = fromChar(ceTags)
            qDict = ceDict.pop('Queues')
            for queue in qDict:
                queueName = '%s_%s' % (ce, queue)
                queueEntry = {}
                self.queueDict[queueName] = queueEntry
                params = qDict[queue]
                queueEntry['ParametersDict'] = params
                params['Queue'] = queue
                params['Site'] = site
                params['GridEnv'] = self.gridEnv
                params['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in params and "SI00" in params:
                    maxCPUTime = float(params['maxCPUTime'])
                    # For some sites there are crazy values in the CS
                    maxCPUTime = max(maxCPUTime, 0)
                    maxCPUTime = min(maxCPUTime, 86400 * 12.5)
                    si00 = float(params['SI00'])
                    queueCPUTime = 60. / 250. * maxCPUTime * si00
                    params['CPUTime'] = int(queueCPUTime)

                # Queue tags are merged with the CE ones
                queueTags = params.get('Tag')
                if queueTags and isinstance(queueTags, basestring):
                    queueTags = fromChar(queueTags)
                    params['Tag'] = queueTags
                if ceTags:
                    if queueTags:
                        params['Tag'] = list(set(ceTags + queueTags))
                    else:
                        # Copy: the CE tag list is shared by all the queues of
                        # this CE and must not receive per-queue tags
                        params['Tag'] = list(ceTags)

                maxMemory = params.get('MaxRAM', None)
                if maxMemory:
                    # MaxRAM value is supposed to be in MB
                    # Floor division keeps the range() argument an integer
                    memoryTags = ['%dGB' % mem for mem in range(1, int(maxMemory) // 1000 + 1)]
                    if memoryTags:
                        # A new list is built, so no shared list is extended in
                        # place and an empty or None 'Tag' value is tolerated
                        params['Tag'] = list(params.get('Tag') or []) + memoryTags

                qwDir = os.path.join(self.workingDirectory, queue)
                if not os.path.exists(qwDir):
                    os.makedirs(qwDir)
                params['WorkingDirectory'] = qwDir

                # Platform: the queue setting wins, then the CE setting, then architecture + OS
                platform = ''
                if "Platform" in params:
                    platform = params['Platform']
                elif "Platform" in ceDict:
                    platform = ceDict['Platform']
                elif "OS" in ceDict:
                    architecture = ceDict.get('architecture', 'x86_64')
                    platform = '_'.join([architecture, ceDict['OS']])

                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if "Platform" not in params and platform:
                    result = Resources.getDIRACPlatform(platform)
                    if result['OK']:
                        params['Platform'] = result['Value'][0]

                # Queue parameters take precedence over the CE ones
                ceQueueDict = dict(ceDict)
                ceQueueDict.update(params)

                # Generate the CE object for the queue or pick the already existing one
                # if the queue definition did not change
                queueHash = self.__generateQueueHash(ceQueueDict)
                if queueName in self.queueCECache and self.queueCECache[queueName]['Hash'] == queueHash:
                    queueCE = self.queueCECache[queueName]['CE']
                else:
                    result = ceFactory.getCE(ceName=ce,
                                             ceType=ceDict['CEType'],
                                             ceParametersDict=ceQueueDict)
                    if not result['OK']:
                        return result
                    self.queueCECache.setdefault(queueName, {})
                    self.queueCECache[queueName]['Hash'] = queueHash
                    self.queueCECache[queueName]['CE'] = result['Value']
                    queueCE = self.queueCECache[queueName]['CE']

                queueEntry['CE'] = queueCE
                queueEntry['CEName'] = ce
                queueEntry['CEType'] = ceDict['CEType']
                queueEntry['Site'] = site
                queueEntry['QueueName'] = queue
                queueEntry['Platform'] = platform

                result = queueEntry['CE'].isValid()
                if not result['OK']:
                    self.log.fatal(result['Message'])
                    return result

                # The queue-level option, when present, hides the CE-level one
                if 'BundleProxy' in params:
                    if params['BundleProxy'].lower() in ['true', 'yes', '1']:
                        queueEntry['BundleProxy'] = True
                elif 'BundleProxy' in ceDict:
                    if ceDict['BundleProxy'].lower() in ['true', 'yes', '1']:
                        queueEntry['BundleProxy'] = True

                if site not in self.sites:
                    self.sites.append(site)

    return S_OK()
def getQueues(self):
    """Build ``self.queueDict`` from the CS description of the configured sites.

    For every site in ``self.siteNames`` the CEs found under
    ``/Resources/Sites/<grid>/<site>/CEs`` are scanned. A CE is retained when
    it is listed in the ``CEs`` agent option (or that option is empty), its
    ``SubmissionMode`` option is ``direct`` and its ``CEType`` is one of the
    ``CETypes`` agent option. For each queue of a retained CE a CE object is
    obtained from ``ComputingElementFactory`` and stored, together with the
    queue parameters, in ``self.queueDict`` under the key ``<ce>_<queue>``.

    :return: S_OK() on success; an S_ERROR (or the failing result itself) as
             soon as a CS lookup, a CE instantiation or a CE validity check
             fails
    """
    ceFactory = ComputingElementFactory()
    ceTypes = self.am_getOption('CETypes', [])
    ceConfList = self.am_getOption('CEs', [])

    for siteName in self.siteNames:
        # Look up CE definitions in the site CS description
        ceList = []
        gridType = siteName.split('.')[0]
        result = gConfig.getSections('/Resources/Sites/%s/%s/CEs' % (gridType, siteName))
        if not result['OK']:
            return S_ERROR('Failed to look up the CS for the site %s CEs' % siteName)
        if not result['Value']:
            return S_ERROR('No CEs found for site %s' % siteName)
        ceTotalList = result['Value']

        for ce in ceTotalList:
            if (ceConfList and ce in ceConfList) or not ceConfList:
                ceType = gConfig.getValue('/Resources/Sites/%s/%s/CEs/%s/CEType' % (gridType, siteName, ce),
                                          'Unknown')
                result = gConfig.getOptionsDict('/Resources/Sites/%s/%s/CEs/%s' % (gridType, siteName, ce))
                if not result['OK']:
                    return S_ERROR('Failed to look up the CS for ce %s' % ce)
                ceDict = result['Value']
                if "SubmissionMode" in ceDict and ceDict['SubmissionMode'].lower() == "direct":
                    if ceType in ceTypes:
                        ceList.append((ce, ceType, ceDict))

        for ce, ceType, ceDict in ceList:
            section = '/Resources/Sites/%s/%s/CEs/%s/Queues' % (gridType, siteName, ce)
            result = gConfig.getSections(section)
            if not result['OK']:
                return S_ERROR('Failed to look up the CS for queues')
            if not result['Value']:
                return S_ERROR('No Queues found for site %s, ce %s' % (siteName, ce))
            queues = result['Value']

            for queue in queues:
                result = gConfig.getOptionsDict('%s/%s' % (section, queue))
                if not result['OK']:
                    return S_ERROR('Failed to look up the CS for ce,queue %s,%s' % (ce, queue))

                queueName = '%s_%s' % (ce, queue)
                # queueParams is the very dict stored under 'ParametersDict'
                queueParams = result['Value']
                self.queueDict[queueName] = {}
                self.queueDict[queueName]['ParametersDict'] = queueParams
                queueParams['Queue'] = queue
                queueParams['Site'] = siteName
                queueParams['GridEnv'] = self.gridEnv
                queueParams['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in queueParams and "SI00" in queueParams:
                    maxCPUTime = float(queueParams['maxCPUTime'])
                    # For some sites there are crazy values in the CS
                    maxCPUTime = max(maxCPUTime, 0)
                    maxCPUTime = min(maxCPUTime, 86400 * 12.5)
                    si00 = float(queueParams['SI00'])
                    queueCPUTime = 60. / 250. * maxCPUTime * si00
                    queueParams['CPUTime'] = int(queueCPUTime)

                qwDir = os.path.join(self.workingDirectory, queue)
                if not os.path.exists(qwDir):
                    # makedirs (not mkdir) so that a not-yet-existing
                    # self.workingDirectory does not make the call fail
                    os.makedirs(qwDir)
                queueParams['WorkingDirectory'] = qwDir

                # CE level options are the defaults, queue level options win
                queueDict = dict(ceDict)
                queueDict.update(queueParams)
                result = ceFactory.getCE(ceName=ce,
                                         ceType=ceType,
                                         ceParametersDict=queueDict)
                if not result['OK']:
                    return result

                self.queueDict[queueName]['CE'] = result['Value']
                self.queueDict[queueName]['CEName'] = ce
                self.queueDict[queueName]['CEType'] = ceType
                self.queueDict[queueName]['Site'] = siteName
                self.queueDict[queueName]['QueueName'] = queue

                result = self.queueDict[queueName]['CE'].isValid()
                if not result['OK']:
                    self.log.fatal(result['Message'])
                    return result

                # Only enable proxy bundling when the option is explicitly set
                # to a true value; its mere presence (e.g. "False") must not
                # switch it on
                bundleProxy = queueParams.get('BundleProxy', '')
                if str(bundleProxy).lower() in ['true', 'yes', '1']:
                    self.queueDict[queueName]['BundleProxy'] = True

    return S_OK()
def initialize(self, loops=0):
    """Set the agent defaults and instantiate the backend Computing Element.

    The CE type comes from the ``CEType`` agent option (default
    ``self.ceName``) and is overridden by ``/LocalSite/LocalCE`` when that
    option is set. A value of the form ``Outer/Inner`` is split: the first
    part becomes ``self.ceName`` and the second one the inner CE submission
    type.

    :param int loops: value stored in the ``MaxCycles`` agent option
    :return: S_OK() on success, otherwise the failing result of the CE
             instantiation or of the CE description lookup
    """
    # Disable monitoring, limited cycles
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    ceType = self.am_getOption('CEType', self.ceName)
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
        self.log.info('Defining CE from local configuration', '= %s' % localCE)
        ceType = localCE

    # Create backend Computing Element
    ceFactory = ComputingElementFactory()
    # It might be "Pool/Singularity", or simply "Pool"
    ceTypeParts = ceType.split('/')
    self.ceName = ceTypeParts[0]
    if len(ceTypeParts) == 2:
        innerSubmissionType = ceTypeParts[1]
    else:
        innerSubmissionType = self.innerCESubmissionType
    self.innerCESubmissionType = innerSubmissionType

    ceInstance = ceFactory.getCE(self.ceName)
    if not ceInstance['OK']:
        self.log.warn("Can't instantiate a CE", ceInstance['Message'])
        return ceInstance
    self.computingElement = ceInstance['Value']
    self.computingElement.ceParameters['InnerCESubmissionType'] = self.innerCESubmissionType

    result = self.computingElement.getDescription()
    if not result['OK']:
        self.log.warn("Can not get the CE description")
        return result
    description = result['Value']
    # When a list is returned only its first element is considered
    ceDict = description[0] if isinstance(description, list) else description

    self.timeLeft = ceDict.get('CPUTime', self.timeLeft)
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

    self.initTimes = os.times()

    # Localsite options
    self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName)
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', self.pilotReference)
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', self.defaultProxyLength)

    # Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', self.cpuFactor)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', self.jobSubmissionDelay)
    self.fillingMode = self.am_getOption('FillingModeFlag', self.fillingMode)
    self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', self.minimumTimeLeft)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure',
                                                      self.stopOnApplicationFailure)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)
    self.extraOptions = gConfig.getValue('/AgentJobRequirements/ExtraOptions', self.extraOptions)

    # Timeleft
    self.timeLeftUtil = TimeLeft()
    return S_OK()
def __getGridJobOutput(pilotReference):
    """Get the pilot job standard output and standard error files for the Grid
    job reference.

    The output already stored in the PilotAgentsDB is returned when it is not
    empty. Otherwise it is retrieved either through ``getWMSPilotOutput``
    (``gLite`` pilots) or through a CE object built for the pilot queue, and
    stored in the PilotAgentsDB when a standard output was obtained.

    :param str pilotReference: the pilot reference
    :return: S_OK(dict) with the keys ``StdOut``, ``StdErr``, ``OwnerDN``,
             ``OwnerGroup`` and ``FileList``, or S_ERROR / the failing result
    """
    result = pilotDB.getPilotInfo(pilotReference)
    if not result['OK'] or not result['Value']:
        return S_ERROR('Failed to get info for pilot ' + pilotReference)

    pilotDict = result['Value'][pilotReference]
    owner = pilotDict['OwnerDN']
    group = pilotDict['OwnerGroup']

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    result = pilotDB.getPilotOutput(pilotReference)
    if result['OK']:
        stdout = result['Value']['StdOut']
        error = result['Value']['StdErr']
        if stdout or error:
            resultDict = {}
            resultDict['StdOut'] = stdout
            resultDict['StdErr'] = error
            resultDict['OwnerDN'] = owner
            resultDict['OwnerGroup'] = group
            resultDict['FileList'] = []
            return S_OK(resultDict)
        else:
            gLogger.warn('Empty pilot output found for %s' % pilotReference)

    gridType = pilotDict['GridType']
    if gridType == "gLite":
        result = getWMSPilotOutput(pilotReference, proxyUserDN=owner, proxyUserGroup=group)  # pylint: disable=unexpected-keyword-arg
        if not result['OK']:
            return S_ERROR('Failed to get pilot output: ' + result['Message'])
        # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
        # The output keys are read directly from the returned structure
        stdout = result['StdOut']
        error = result['StdErr']
        fileList = result['FileList']
        if stdout:
            result = pilotDB.storePilotOutput(pilotReference, stdout, error)
            if not result['OK']:
                gLogger.error('Failed to store pilot output:', result['Message'])

        resultDict = {}
        resultDict['StdOut'] = stdout
        resultDict['StdErr'] = error
        resultDict['OwnerDN'] = owner
        resultDict['OwnerGroup'] = group
        resultDict['FileList'] = fileList
        return S_OK(resultDict)
    else:
        # Instantiate the appropriate CE
        ceFactory = ComputingElementFactory()
        result = getQueue(pilotDict['GridSite'], pilotDict['DestinationSite'], pilotDict['Queue'])
        if not result['OK']:
            return result
        queueDict = result['Value']
        gridEnv = getGridEnv()
        queueDict['GridEnv'] = gridEnv
        workingDirectory = mkdtemp()
        queueDict['WorkingDirectory'] = workingDirectory
        try:
            result = ceFactory.getCE(gridType, pilotDict['DestinationSite'], queueDict)
            if not result['OK']:
                return result
            ce = result['Value']

            groupVOMS = getGroupOption(group, 'VOMSRole', group)
            result = gProxyManager.getPilotProxyFromVOMSGroup(owner, groupVOMS)
            if not result['OK']:
                gLogger.error(result['Message'])
                gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (owner, groupVOMS))
                return S_ERROR("Failed to get the pilot's owner proxy")
            proxy = result['Value']
            ce.setProxy(proxy)

            pilotStamp = pilotDict['PilotStamp']
            pRef = pilotReference
            if pilotStamp:
                pRef = pRef + ':::' + pilotStamp
            result = ce.getJobOutput(pRef)
            if not result['OK']:
                return result
            stdout, error = result['Value']
            if stdout:
                result = pilotDB.storePilotOutput(pilotReference, stdout, error)
                if not result['OK']:
                    gLogger.error('Failed to store pilot output:', result['Message'])

            resultDict = {}
            resultDict['StdOut'] = stdout
            resultDict['StdErr'] = error
            resultDict['OwnerDN'] = owner
            resultDict['OwnerGroup'] = group
            resultDict['FileList'] = []
            return S_OK(resultDict)
        finally:
            # The temporary working directory is removed on every exit path,
            # including the proxy retrieval failure and unexpected exceptions
            shutil.rmtree(workingDirectory)
def getQueuesResolved(
    siteDict, queueCECache, gridEnv=None, setup=None, workingDir="", checkPlatform=False, instantiateCEs=False
):
    """Resolve the CE/queue description found in siteDict into a flat dictionary of queues.

    Every queue is stored under the key ``<ce>_<queue>``. Its ``ParametersDict``
    is the queue description taken from ``siteDict`` (modified in place),
    completed with CE level information. Note that the ``Queues`` entry is
    popped from each CE dictionary of ``siteDict``.

    :param dict siteDict: ``{site: {ce: ceDict}}`` where each ceDict holds a ``Queues`` entry
    :param dict queueCECache: cache of CE objects keyed by queue name, updated in place
    :param gridEnv: value stored as the ``GridEnv`` queue parameter
    :param setup: value stored as the ``Setup`` queue parameter
    :param workingDir: not used in this function body
    :param bool checkPlatform: when True, ``setPlatform`` is called for each queue
    :param bool instantiateCEs: when True, a CE object is created (or taken from the cache)
                                for each queue; queues whose CE cannot be created or is
                                not valid are dropped from the result
    :return: S_OK(dict of queues)
    """
    queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in siteDict:
        siteCEs = siteDict[site]
        for ce in siteCEs:
            ceDict = siteCEs[ce]
            pilotRunDirectory = ceDict.get("PilotRunDirectory", "")
            qDict = ceDict.pop("Queues")

            for queue in qDict:
                queueName = "%s_%s" % (ce, queue)
                queueParams = qDict[queue]
                queueEntry = {"ParametersDict": queueParams}
                queueDict[queueName] = queueEntry

                queueParams["Queue"] = queue
                queueParams["GridCE"] = ce
                queueParams["Site"] = site
                queueParams["GridEnv"] = gridEnv
                queueParams["Setup"] = setup

                # Evaluate the CPU limit of the queue according to the Glue convention
                computeQueueCPULimit(queueParams)

                # Tags & RequiredTags defined on the Queue level and on the CE level are concatenated
                # This also converts them from a string to a list if required.
                resolveTags(ceDict, queueParams)

                # Some parameters can be defined on the CE level and are inherited by all Queues
                setAdditionalParams(ceDict, queueParams)

                if pilotRunDirectory:
                    queueParams["JobExecDir"] = pilotRunDirectory

                # CE level options first, overridden by the queue level ones
                ceQueueDict = dict(ceDict)
                ceQueueDict.update(queueParams)

                if instantiateCEs:
                    # Reuse the cached CE object as long as the queue
                    # definition (its hash) did not change
                    queueHash = generateQueueHash(ceQueueDict)
                    if queueName in queueCECache and queueCECache[queueName]["Hash"] == queueHash:
                        queueCE = queueCECache[queueName]["CE"]
                    else:
                        result = ceFactory.getCE(ceName=ce, ceType=ceDict["CEType"], ceParametersDict=ceQueueDict)
                        if not result["OK"]:
                            queueDict.pop(queueName)
                            continue
                        cacheEntry = queueCECache.setdefault(queueName, {})
                        cacheEntry["Hash"] = queueHash
                        cacheEntry["CE"] = result["Value"]
                        queueCE = cacheEntry["CE"]

                    queueParams.update(queueCE.ceParameters)
                    queueEntry["CE"] = queueCE
                    result = queueCE.isValid()
                    if not result["OK"]:
                        queueDict.pop(queueName)
                        queueCECache.pop(queueName)
                        continue

                queueEntry["CEName"] = ce
                queueEntry["CEType"] = ceDict["CEType"]
                queueEntry["Site"] = site
                queueEntry["QueueName"] = queue
                queueEntry["QueryCEFlag"] = ceDict.get("QueryCEFlag", "false")

                if checkPlatform:
                    setPlatform(ceDict, queueParams)

                # The queue level option takes precedence over the CE level one
                bundleProxy = queueParams.get("BundleProxy", ceDict.get("BundleProxy"))
                if bundleProxy and bundleProxy.lower() in ["true", "yes", "1"]:
                    queueEntry["BundleProxy"] = True

    return S_OK(queueDict)
def getQueues(self, resourceDict):
    """Rebuild ``self.queueDict`` from the given resources description.

    Every queue is stored under the key ``<ce>_<queue>`` with its parameters
    (queue description completed with CE level tags, MaxRAM, pilot run
    directory, working directory and platform) and its CE object, which is
    taken from ``self.queueCECache`` when the queue definition hash did not
    change. ``self.platforms`` and ``self.sites`` are extended on the way.
    Note that the ``Queues`` entry is popped from each CE dictionary of
    ``resourceDict``.

    :param dict resourceDict: ``{site: {ce: ceDict}}`` where each ceDict holds a ``Queues`` entry
    :return: S_OK() on success, otherwise the failing result of the CE
             instantiation or of the CE validity check
    """
    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
        siteCEs = resourceDict[site]
        for ce in siteCEs:
            ceDict = siteCEs[ce]
            ceTags = ceDict.get('Tag', [])
            pilotRunDirectory = ceDict.get('PilotRunDirectory', '')
            if isinstance(ceTags, basestring):
                ceTags = fromChar(ceTags)
            ceMaxRAM = ceDict.get('MaxRAM', None)
            qDict = ceDict.pop('Queues')

            for queue in qDict:
                queueName = '%s_%s' % (ce, queue)
                queueParams = qDict[queue]
                queueEntry = {'ParametersDict': queueParams}
                self.queueDict[queueName] = queueEntry

                queueParams['Queue'] = queue
                queueParams['Site'] = site
                queueParams['GridEnv'] = self.gridEnv
                queueParams['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in queueParams and "SI00" in queueParams:
                    # For some sites there are crazy values in the CS:
                    # clamp the value between 0 and 12.5 days
                    maxCPUTime = float(queueParams['maxCPUTime'])
                    maxCPUTime = max(maxCPUTime, 0)
                    maxCPUTime = min(maxCPUTime, 86400 * 12.5)
                    si00 = float(queueParams['SI00'])
                    queueCPUTime = 60. / 250. * maxCPUTime * si00
                    queueParams['CPUTime'] = int(queueCPUTime)

                # Tags: convert a string into a list, then merge with the CE ones
                queueTags = queueParams.get('Tag')
                if queueTags and isinstance(queueTags, basestring):
                    queueTags = fromChar(queueTags)
                    queueParams['Tag'] = queueTags
                if ceTags:
                    if queueTags:
                        queueParams['Tag'] = list(set(ceTags + queueTags))
                    else:
                        queueParams['Tag'] = ceTags

                # The queue level MaxRAM wins over the CE level one
                maxRAM = queueParams.get('MaxRAM') or ceMaxRAM
                if maxRAM:
                    queueParams['MaxRAM'] = maxRAM

                if pilotRunDirectory:
                    queueParams['JobExecDir'] = pilotRunDirectory

                qwDir = os.path.join(self.workingDirectory, queue)
                mkDir(qwDir)
                queueParams['WorkingDirectory'] = qwDir

                # Platform: queue level, then CE level, then built from the CE OS
                platform = ''
                if "Platform" in queueParams:
                    platform = queueParams['Platform']
                elif "Platform" in ceDict:
                    platform = ceDict['Platform']
                elif "OS" in ceDict:
                    architecture = ceDict.get('architecture', 'x86_64')
                    OS = ceDict['OS']
                    platform = '_'.join([architecture, OS])

                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if "Platform" not in queueParams and platform:
                    result = Resources.getDIRACPlatform(platform)
                    if result['OK']:
                        queueParams['Platform'] = result['Value'][0]

                # CE level options first, overridden by the queue level ones
                ceQueueDict = dict(ceDict)
                ceQueueDict.update(queueParams)

                # Generate the CE object for the queue or pick the already existing one
                # if the queue definition did not change
                queueHash = self.__generateQueueHash(ceQueueDict)
                if queueName in self.queueCECache and self.queueCECache[queueName]['Hash'] == queueHash:
                    queueCE = self.queueCECache[queueName]['CE']
                else:
                    result = ceFactory.getCE(ceName=ce,
                                             ceType=ceDict['CEType'],
                                             ceParametersDict=ceQueueDict)
                    if not result['OK']:
                        return result
                    cacheEntry = self.queueCECache.setdefault(queueName, {})
                    cacheEntry['Hash'] = queueHash
                    cacheEntry['CE'] = result['Value']
                    queueCE = cacheEntry['CE']

                queueEntry['CE'] = queueCE
                queueEntry['CEName'] = ce
                queueEntry['CEType'] = ceDict['CEType']
                queueEntry['Site'] = site
                queueEntry['QueueName'] = queue
                queueEntry['Platform'] = platform

                result = queueCE.isValid()
                if not result['OK']:
                    self.log.fatal(result['Message'])
                    return result

                # The CE level option is only looked at when the queue does
                # not define BundleProxy at all
                if 'BundleProxy' in queueParams:
                    if queueParams['BundleProxy'].lower() in ['true', 'yes', '1']:
                        queueEntry['BundleProxy'] = True
                elif 'BundleProxy' in ceDict:
                    if ceDict['BundleProxy'].lower() in ['true', 'yes', '1']:
                        queueEntry['BundleProxy'] = True

                if site not in self.sites:
                    self.sites.append(site)

    return S_OK()
def __getGridJobOutput(self,pilotReference):
    """Get the pilot job standard output and standard error files for the Grid
    job reference.

    The output already stored in the PilotAgentsDB is returned when it is not
    empty. Otherwise it is retrieved either with ``getPilotOutput`` and the
    pilot owner proxy (``LCG``, ``gLite`` and ``CREAM`` pilots) or through a CE
    object built for the pilot queue, and stored in the PilotAgentsDB when a
    standard output was obtained.

    :param str pilotReference: the pilot reference
    :return: S_OK(dict) with the keys ``StdOut``, ``StdErr``, ``OwnerDN``,
             ``OwnerGroup`` and ``FileList``, or S_ERROR / the failing result
    """
    result = pilotDB.getPilotInfo(pilotReference)
    if not result['OK'] or not result['Value']:
        return S_ERROR('Failed to get info for pilot ' + pilotReference)

    pilotDict = result['Value'][pilotReference]
    owner = pilotDict['OwnerDN']
    group = pilotDict['OwnerGroup']

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    result = pilotDB.getPilotOutput(pilotReference)
    if result['OK']:
        stdout = result['Value']['StdOut']
        error = result['Value']['StdErr']
        if stdout or error:
            resultDict = {}
            resultDict['StdOut'] = stdout
            resultDict['StdErr'] = error
            resultDict['OwnerDN'] = owner
            resultDict['OwnerGroup'] = group
            resultDict['FileList'] = []
            return S_OK(resultDict)
        else:
            gLogger.warn('Empty pilot output found for %s' % pilotReference)

    gridType = pilotDict['GridType']
    if gridType in ["LCG", "gLite", "CREAM"]:
        # Keep the VOMS role in its own variable: 'group' must remain the
        # pilot owner group, as it is returned in 'OwnerGroup' below
        groupVOMS = getGroupOption(group, 'VOMSRole', group)
        ret = gProxyManager.getPilotProxyFromVOMSGroup(owner, groupVOMS)
        if not ret['OK']:
            gLogger.error(ret['Message'])
            gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (owner, groupVOMS))
            return S_ERROR("Failed to get the pilot's owner proxy")
        proxy = ret['Value']

        pilotStamp = pilotDict['PilotStamp']
        result = getPilotOutput(proxy, gridType, pilotReference, pilotStamp)
        if not result['OK']:
            return S_ERROR('Failed to get pilot output: ' + result['Message'])
        # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
        # The output keys are read directly from the returned structure
        stdout = result['StdOut']
        error = result['StdErr']
        fileList = result['FileList']
        if stdout:
            result = pilotDB.storePilotOutput(pilotReference, stdout, error)
            if not result['OK']:
                gLogger.error('Failed to store pilot output:', result['Message'])

        resultDict = {}
        resultDict['StdOut'] = stdout
        resultDict['StdErr'] = error
        resultDict['OwnerDN'] = owner
        resultDict['OwnerGroup'] = group
        resultDict['FileList'] = fileList
        return S_OK(resultDict)
    else:
        # Instantiate the appropriate CE
        ceFactory = ComputingElementFactory()
        result = getQueue(pilotDict['GridSite'], pilotDict['DestinationSite'], pilotDict['Queue'])
        if not result['OK']:
            return result
        queueDict = result['Value']
        result = ceFactory.getCE(gridType, pilotDict['DestinationSite'], queueDict)
        if not result['OK']:
            return result
        ce = result['Value']

        # The CE job reference is the pilot reference, completed with the
        # pilot stamp when there is one
        pilotStamp = pilotDict['PilotStamp']
        pRef = pilotReference
        if pilotStamp:
            pRef = pRef + ':::' + pilotStamp
        result = ce.getJobOutput(pRef)
        if not result['OK']:
            return result
        stdout, error = result['Value']
        if stdout:
            result = pilotDB.storePilotOutput(pilotReference, stdout, error)
            if not result['OK']:
                gLogger.error('Failed to store pilot output:', result['Message'])

        resultDict = {}
        resultDict['StdOut'] = stdout
        resultDict['StdErr'] = error
        resultDict['OwnerDN'] = owner
        resultDict['OwnerGroup'] = group
        resultDict['FileList'] = []
        return S_OK(resultDict)
def export_killPilot(pilotRefList):
    """Kill the specified pilots.

    The pilots are regrouped per owner, owner group, site, CE and queue; one
    CE object is then created per group and asked to kill the pilots of that
    group.

    :param pilotRefList: a single pilot reference (string) or a collection of
                         pilot references
    :return: S_OK() when every kill request succeeded; S_ERROR (or the failing
             result) when the pilot information, the queue description, the CE
             object or the owner proxy cannot be obtained, or when at least
             one kill request failed
    """
    # Make a list if it is not yet. isinstance() also recognises subclasses of
    # the string types, which would otherwise be split into characters
    if isinstance(pilotRefList, StringTypes):
        pilotRefs = [pilotRefList]
    else:
        pilotRefs = list(pilotRefList)

    # Regroup pilots per site and per owner
    pilotRefDict = {}
    for pilotReference in pilotRefs:
        result = pilotDB.getPilotInfo(pilotReference)
        if not result['OK'] or not result['Value']:
            return S_ERROR('Failed to get info for pilot ' + pilotReference)

        pilotDict = result['Value'][pilotReference]
        owner = pilotDict['OwnerDN']
        group = pilotDict['OwnerGroup']
        queue = '@@@'.join([owner, group, pilotDict['GridSite'], pilotDict['DestinationSite'], pilotDict['Queue']])
        gridType = pilotDict['GridType']

        queueEntry = pilotRefDict.setdefault(queue, {})
        queueEntry.setdefault('PilotList', []).append(pilotReference)
        queueEntry['GridType'] = gridType

    # Do the work now queue by queue
    ceFactory = ComputingElementFactory()
    failed = []
    for key, pilotDict in pilotRefDict.items():
        owner, group, site, ce, queue = key.split('@@@')
        result = getQueue(site, ce, queue)
        if not result['OK']:
            return result
        queueDict = result['Value']
        gridType = pilotDict['GridType']
        result = ceFactory.getCE(gridType, ce, queueDict)
        if not result['OK']:
            return result
        ceObject = result['Value']

        # FIXME: quite hacky. Should be either removed, or based on some flag
        if gridType in ["LCG", "gLite", "CREAM", "ARC", "Globus"]:
            group = getGroupOption(group, 'VOMSRole', group)
            ret = gProxyManager.getPilotProxyFromVOMSGroup(owner, group)
            if not ret['OK']:
                gLogger.error(ret['Message'])
                gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (owner, group))
                return S_ERROR("Failed to get the pilot's owner proxy")
            proxy = ret['Value']
            ceObject.setProxy(proxy)

        pilotList = pilotDict['PilotList']
        result = ceObject.killJob(pilotList)
        if not result['OK']:
            # Remember the failure but keep processing the other queues
            failed.extend(pilotList)

    if failed:
        return S_ERROR('Failed to kill at least some pilots')
    return S_OK()
def __getGridJobOutput(self, pilotReference):
    """Get the pilot job standard output and standard error files for the Grid
    job reference.

    The output already stored in the PilotAgentsDB is returned when it is not
    empty. Otherwise it is retrieved either with ``getPilotOutput`` and the
    pilot owner proxy (``LCG``, ``gLite`` and ``CREAM`` pilots) or through a CE
    object built from the queue description given by ``Resources``, and
    stored in the PilotAgentsDB when a standard output was obtained.

    :param str pilotReference: the pilot reference
    :return: S_OK(dict) with the keys ``StdOut``, ``StdErr``, ``OwnerDN``,
             ``OwnerGroup`` and ``FileList``, or S_ERROR / the failing result
    """
    result = pilotDB.getPilotInfo(pilotReference)
    if not result['OK'] or not result['Value']:
        return S_ERROR('Failed to get info for pilot ' + pilotReference)

    pilotDict = result['Value'][pilotReference]
    owner = pilotDict['OwnerDN']
    group = pilotDict['OwnerGroup']

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    result = pilotDB.getPilotOutput(pilotReference)
    if result['OK']:
        stdout = result['Value']['StdOut']
        error = result['Value']['StdErr']
        if stdout or error:
            resultDict = {}
            resultDict['StdOut'] = stdout
            resultDict['StdErr'] = error
            resultDict['OwnerDN'] = owner
            resultDict['OwnerGroup'] = group
            resultDict['FileList'] = []
            return S_OK(resultDict)
        else:
            gLogger.warn('Empty pilot output found for %s' % pilotReference)

    gridType = pilotDict['GridType']
    if gridType in ["LCG", "gLite", "CREAM"]:
        # Keep the VOMS role in its own variable: 'group' must remain the
        # pilot owner group, as it is returned in 'OwnerGroup' below
        groupVOMS = getGroupOption(group, 'VOMSRole', group)
        ret = gProxyManager.getPilotProxyFromVOMSGroup(owner, groupVOMS)
        if not ret['OK']:
            gLogger.error(ret['Message'])
            gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (owner, groupVOMS))
            return S_ERROR("Failed to get the pilot's owner proxy")
        proxy = ret['Value']

        pilotStamp = pilotDict['PilotStamp']
        result = getPilotOutput(proxy, gridType, pilotReference, pilotStamp)
        if not result['OK']:
            return S_ERROR('Failed to get pilot output: ' + result['Message'])
        # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
        # The output keys are read directly from the returned structure
        stdout = result['StdOut']
        error = result['StdErr']
        fileList = result['FileList']
        if stdout:
            result = pilotDB.storePilotOutput(pilotReference, stdout, error)
            if not result['OK']:
                gLogger.error('Failed to store pilot output:', result['Message'])

        resultDict = {}
        resultDict['StdOut'] = stdout
        resultDict['StdErr'] = error
        resultDict['OwnerDN'] = owner
        resultDict['OwnerGroup'] = group
        resultDict['FileList'] = fileList
        return S_OK(resultDict)
    else:
        # Instantiate the appropriate CE
        ceFactory = ComputingElementFactory()
        result = Resources(group=group).getQueueDescription(pilotDict['Queue'])
        if not result['OK']:
            return result
        queueDict = result['Value']
        result = ceFactory.getCE(gridType, pilotDict['DestinationSite'], queueDict)
        if not result['OK']:
            return result
        ce = result['Value']

        # The CE job reference is the pilot reference, completed with the
        # pilot stamp when there is one
        pilotStamp = pilotDict['PilotStamp']
        pRef = pilotReference
        if pilotStamp:
            pRef = pRef + ':::' + pilotStamp
        result = ce.getJobOutput(pRef)
        if not result['OK']:
            return result
        stdout, error = result['Value']
        if stdout:
            result = pilotDB.storePilotOutput(pilotReference, stdout, error)
            if not result['OK']:
                gLogger.error('Failed to store pilot output:', result['Message'])

        resultDict = {}
        resultDict['StdOut'] = stdout
        resultDict['StdErr'] = error
        resultDict['OwnerDN'] = owner
        resultDict['OwnerGroup'] = group
        resultDict['FileList'] = []
        return S_OK(resultDict)
def getQueues(self):
    """Rebuild ``self.queueDict`` from the CS description of the configured sites.

    ``self.queueDict`` is emptied first. For every site in ``self.siteNames``
    the CEs found under ``/Resources/Sites/<grid>/<site>/CEs`` are scanned. A
    CE is retained when it is listed in the ``CEs`` agent option (or that
    option is empty), its ``SubmissionMode`` option is ``direct`` and its
    ``CEType`` is one of the ``CETypes`` agent option. For each queue of a
    retained CE a CE object is obtained from ``ComputingElementFactory`` and
    stored, together with the queue parameters, under the key
    ``<ce>_<queue>``.

    :return: S_OK() on success; an S_ERROR (or the failing result itself) as
             soon as a CS lookup, a CE instantiation or a CE validity check
             fails
    """
    self.queueDict = {}
    ceFactory = ComputingElementFactory()
    ceTypes = self.am_getOption('CETypes', [])
    ceConfList = self.am_getOption('CEs', [])

    for siteName in self.siteNames:
        # Look up CE definitions in the site CS description
        ceList = []
        gridType = siteName.split('.')[0]
        result = gConfig.getSections('/Resources/Sites/%s/%s/CEs' % (gridType, siteName))
        if not result['OK']:
            return S_ERROR('Failed to look up the CS for the site %s CEs' % siteName)
        if not result['Value']:
            return S_ERROR('No CEs found for site %s' % siteName)
        ceTotalList = result['Value']

        for ce in ceTotalList:
            if (ceConfList and ce in ceConfList) or not ceConfList:
                ceType = gConfig.getValue('/Resources/Sites/%s/%s/CEs/%s/CEType' % (gridType, siteName, ce),
                                          'Unknown')
                result = gConfig.getOptionsDict('/Resources/Sites/%s/%s/CEs/%s' % (gridType, siteName, ce))
                if not result['OK']:
                    return S_ERROR('Failed to look up the CS for ce %s' % ce)
                ceDict = result['Value']
                if "SubmissionMode" in ceDict and ceDict['SubmissionMode'].lower() == "direct":
                    if ceType in ceTypes:
                        ceList.append((ce, ceType, ceDict))

        for ce, ceType, ceDict in ceList:
            section = '/Resources/Sites/%s/%s/CEs/%s/Queues' % (gridType, siteName, ce)
            result = gConfig.getSections(section)
            if not result['OK']:
                return S_ERROR('Failed to look up the CS for queues')
            if not result['Value']:
                return S_ERROR('No Queues found for site %s, ce %s' % (siteName, ce))
            queues = result['Value']

            for queue in queues:
                result = gConfig.getOptionsDict('%s/%s' % (section, queue))
                if not result['OK']:
                    return S_ERROR('Failed to look up the CS for ce,queue %s,%s' % (ce, queue))

                queueName = '%s_%s' % (ce, queue)
                # queueParams is the very dict stored under 'ParametersDict'
                queueParams = result['Value']
                self.queueDict[queueName] = {}
                self.queueDict[queueName]['ParametersDict'] = queueParams
                queueParams['Queue'] = queue
                queueParams['Site'] = siteName
                queueParams['GridEnv'] = self.gridEnv
                queueParams['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in queueParams and "SI00" in queueParams:
                    maxCPUTime = float(queueParams['maxCPUTime'])
                    # For some sites there are crazy values in the CS
                    maxCPUTime = max(maxCPUTime, 0)
                    maxCPUTime = min(maxCPUTime, 86400 * 12.5)
                    si00 = float(queueParams['SI00'])
                    queueCPUTime = 60. / 250. * maxCPUTime * si00
                    queueParams['CPUTime'] = int(queueCPUTime)

                qwDir = os.path.join(self.workingDirectory, queue)
                if not os.path.exists(qwDir):
                    os.makedirs(qwDir)
                queueParams['WorkingDirectory'] = qwDir

                # CE level options are the defaults, queue level options win
                queueDict = dict(ceDict)
                queueDict.update(queueParams)
                result = ceFactory.getCE(ceName=ce,
                                         ceType=ceType,
                                         ceParametersDict=queueDict)
                if not result['OK']:
                    return result

                self.queueDict[queueName]['CE'] = result['Value']
                self.queueDict[queueName]['CEName'] = ce
                self.queueDict[queueName]['CEType'] = ceType
                self.queueDict[queueName]['Site'] = siteName
                self.queueDict[queueName]['QueueName'] = queue

                result = self.queueDict[queueName]['CE'].isValid()
                if not result['OK']:
                    self.log.fatal(result['Message'])
                    return result

                # Only enable proxy bundling when the option is explicitly set
                # to a true value; its mere presence (e.g. "False") must not
                # switch it on
                bundleProxy = queueParams.get('BundleProxy', '')
                if str(bundleProxy).lower() in ['true', 'yes', '1']:
                    self.queueDict[queueName]['BundleProxy'] = True

    return S_OK()