def export_killPilot(self, pilotRefList):
    """ Kill the specified pilots

    The pilots are regrouped by owner, group, grid site, destination site
    and queue, and one kill request is sent to the Computing Element of
    each such group.

    :param pilotRefList: a single pilot reference (string) or a sequence of
                         pilot references
    :return: S_OK() if all the kill requests succeeded. S_ERROR if the
             information of a pilot or the proxy of its owner cannot be
             obtained, or if at least one kill request failed. A failed
             getQueue() or getCE() result is returned as it is.
    """
    # Make a list if it is not yet. isinstance() is used rather than an exact
    # type comparison so that an instance of a string subclass is also taken
    # as one reference instead of being split into its characters.
    if isinstance(pilotRefList, StringTypes):
        pilotRefs = [pilotRefList]
    else:
        pilotRefs = list(pilotRefList)

    # Regroup pilots per site and per owner
    pilotRefDict = {}
    for pilotReference in pilotRefs:
        result = pilotDB.getPilotInfo(pilotReference)
        if not result['OK'] or not result['Value']:
            return S_ERROR('Failed to get info for pilot ' + pilotReference)
        pilotDict = result['Value'][pilotReference]
        owner = pilotDict['OwnerDN']
        group = pilotDict['OwnerGroup']
        queue = '@@@'.join([owner, group, pilotDict['GridSite'],
                            pilotDict['DestinationSite'], pilotDict['Queue']])
        gridType = pilotDict['GridType']
        queueEntry = pilotRefDict.setdefault(queue, {})
        queueEntry.setdefault('PilotList', []).append(pilotReference)
        queueEntry['GridType'] = gridType

    # Do the work now queue by queue
    ceFactory = ComputingElementFactory()
    failed = []
    for key, pilotDict in pilotRefDict.items():
        # ceName is the CE name taken from the key, ce the CE object built from it
        owner, group, site, ceName, queue = key.split('@@@')
        result = getQueue(site, ceName, queue)
        if not result['OK']:
            return result
        queueDict = result['Value']
        gridType = pilotDict['GridType']
        result = ceFactory.getCE(gridType, ceName, queueDict)
        if not result['OK']:
            return result
        ce = result['Value']

        # Only for these grid types the CE is given the proxy of the pilot owner
        if gridType in ["LCG", "gLite", "CREAM"]:
            group = getGroupOption(group, 'VOMSRole', group)
            ret = gProxyManager.getPilotProxyFromVOMSGroup(owner, group)
            if not ret['OK']:
                gLogger.error(ret['Message'])
                gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (owner, group))
                return S_ERROR("Failed to get the pilot's owner proxy")
            ce.setProxy(ret['Value'])

        pilotList = pilotDict['PilotList']
        result = ce.killJob(pilotList)
        if not result['OK']:
            failed.extend(pilotList)

    if failed:
        return S_ERROR('Failed to kill at least some pilots')
    return S_OK()
def export_getPilotLoggingInfo(self, pilotReference):
    """ Get the pilot logging info for the Grid job reference

    :param pilotReference: the pilot reference, used as key in the pilot database
    :return: the result of getPilotLoggingInfo(), or S_ERROR if the pilot
             information or the proxy of the pilot owner cannot be obtained
    """
    infoResult = pilotDB.getPilotInfo(pilotReference)
    if not (infoResult['OK'] and infoResult['Value']):
        return S_ERROR('Failed to determine owner for pilot ' + pilotReference)

    pilotInfo = infoResult['Value'][pilotReference]
    ownerDN = pilotInfo['OwnerDN']
    ownerGroup = pilotInfo['OwnerGroup']
    # The proxy is requested for the 'VOMSRole' option of the group; the group
    # itself is passed as the default value
    vomsGroup = getGroupOption(ownerGroup, 'VOMSRole', ownerGroup)

    proxyResult = gProxyManager.getPilotProxyFromVOMSGroup(ownerDN, vomsGroup)
    if proxyResult['OK']:
        pilotProxy = proxyResult['Value']
        return getPilotLoggingInfo(pilotProxy, pilotInfo['GridType'], pilotReference)

    gLogger.error(proxyResult['Message'])
    gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (ownerDN, vomsGroup))
    return S_ERROR("Failed to get the pilot's owner proxy")
def __getGridJobOutput(self, pilotReference):
    """ Get the pilot job standard output and standard error files for the Grid job reference

    The output already stored in the pilot database is returned when it
    exists; otherwise it is fetched with getPilotOutput() using the proxy of
    the pilot owner and stored in the database.

    :param pilotReference: the pilot reference, used as key in the pilot database
    :return: S_OK with a dictionary having the keys StdOut, StdErr, OwnerDN,
             OwnerGroup and FileList, or S_ERROR
    """
    infoResult = pilotDB.getPilotInfo(pilotReference)
    if not (infoResult['OK'] and infoResult['Value']):
        return S_ERROR('Failed to get info for pilot ' + pilotReference)
    pilotInfo = infoResult['Value'][pilotReference]
    ownerDN = pilotInfo['OwnerDN']
    ownerGroup = pilotInfo['OwnerGroup']

    def packOutput(stdOut, stdErr, groupName, files):
        """ Build the dictionary handed back to the caller """
        return {'StdOut': stdOut,
                'StdErr': stdErr,
                'OwnerDN': ownerDN,
                'OwnerGroup': groupName,
                'FileList': files}

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    storedResult = pilotDB.getPilotOutput(pilotReference)
    if storedResult['OK']:
        stdOut = storedResult['Value']['StdOut']
        stdErr = storedResult['Value']['StdErr']
        if not (stdOut or stdErr):
            return S_ERROR('Empty pilot output found')
        return S_OK(packOutput(stdOut, stdErr, ownerGroup, []))

    gridType = pilotInfo['GridType']
    if gridType not in ["LCG", "gLite", "CREAM"]:
        return S_ERROR('Can not retrieve pilot output for the Grid %s ' % gridType)

    vomsGroup = getGroupOption(ownerGroup, 'VOMSRole', ownerGroup)
    proxyResult = gProxyManager.getPilotProxyFromVOMSGroup(ownerDN, vomsGroup)
    if not proxyResult['OK']:
        gLogger.error(proxyResult['Message'])
        gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (ownerDN, vomsGroup))
        return S_ERROR("Failed to get the pilot's owner proxy")

    gridResult = getPilotOutput(proxyResult['Value'], gridType, pilotReference, pilotInfo['PilotStamp'])
    if not gridResult['OK']:
        return S_ERROR('Failed to get pilot output: ' + gridResult['Message'])

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    stdOut = gridResult['StdOut']
    stdErr = gridResult['StdErr']
    fileList = gridResult['FileList']
    # A failure to store the output is only logged, the output is still returned
    storeResult = pilotDB.storePilotOutput(pilotReference, stdOut, stdErr)
    if not storeResult['OK']:
        gLogger.error('Failed to store pilot output:', storeResult['Message'])
    # On this path OwnerGroup carries the value obtained from getGroupOption()
    return S_OK(packOutput(stdOut, stdErr, vomsGroup, fileList))
def getPilotProxy(pilotDict):
    """Get a proxy bound to a pilot

    :param dict pilotDict: pilot description, read for its "OwnerDN" and
        "OwnerGroup" keys
    :return: S_OK(proxy), or S_ERROR if the proxy manager call fails
    """
    ownerDN = pilotDict["OwnerDN"]
    ownerGroup = pilotDict["OwnerGroup"]
    # The group itself is passed as the default of the "VOMSRole" option
    vomsGroup = getGroupOption(ownerGroup, "VOMSRole", ownerGroup)

    proxyResult = gProxyManager.getPilotProxyFromVOMSGroup(ownerDN, vomsGroup)
    if proxyResult["OK"]:
        return S_OK(proxyResult["Value"])

    gLogger.error("Could not get proxy:",
                  'User "%s" Group "%s" : %s' % (ownerDN, vomsGroup, proxyResult["Message"]))
    return S_ERROR("Failed to get the pilot's owner proxy")
def killPilotsInQueues(pilotRefDict):
    """Kill pilots queue by queue

    :param dict pilotRefDict: a dict of pilots in queues. Each key has the
        form owner@@@group@@@site@@@ce@@@queue and each value is a dict with
        the "GridType" and "PilotList" keys
    :return: the list of pilot references whose kill request failed. Note that
        a failed getQueue() or getCE() result, or an S_ERROR when the owner
        proxy cannot be obtained, is returned instead of a list
    """
    factory = ComputingElementFactory()
    failedPilots = []

    for queueKey, queueInfo in pilotRefDict.items():
        ownerDN, ownerGroup, siteName, ceName, queueName = queueKey.split("@@@")

        queueResult = getQueue(siteName, ceName, queueName)
        if not queueResult["OK"]:
            return queueResult
        queueDescription = queueResult["Value"]

        gridType = queueInfo["GridType"]
        ceResult = factory.getCE(gridType, ceName, queueDescription)
        if not ceResult["OK"]:
            return ceResult
        computingElement = ceResult["Value"]

        # FIXME: quite hacky. Should be either removed, or based on some flag
        if gridType in ["CREAM", "ARC", "Globus", "HTCondorCE"]:
            vomsGroup = getGroupOption(ownerGroup, "VOMSRole", ownerGroup)
            proxyResult = gProxyManager.getPilotProxyFromVOMSGroup(ownerDN, vomsGroup)
            if not proxyResult["OK"]:
                gLogger.error(
                    "Could not get proxy:",
                    'User "%s" Group "%s" : %s' % (ownerDN, vomsGroup, proxyResult["Message"]))
                return S_ERROR("Failed to get the pilot's owner proxy")
            computingElement.setProxy(proxyResult["Value"])

        pilots = queueInfo["PilotList"]
        killResult = computingElement.killJob(pilots)
        if not killResult["OK"]:
            # The whole list of the queue is reported as failed
            failedPilots.extend(pilots)

    return failedPilots
def export_getPilotLoggingInfo(self, pilotReference):
    """ Get the pilot logging info for the Grid job reference

    :param pilotReference: the pilot reference, used as key in the pilot database
    :return: the result of getPilotLoggingInfo(), or S_ERROR if the pilot
             information or the proxy of the pilot owner cannot be obtained
    """
    result = pilotDB.getPilotInfo(pilotReference)
    if not result['OK'] or not result['Value']:
        return S_ERROR('Failed to determine owner for pilot ' + pilotReference)

    pilotDict = result['Value'][pilotReference]
    owner = pilotDict['OwnerDN']
    group = pilotDict['OwnerGroup']
    # Resolve the 'VOMSRole' option of the group before asking for the proxy,
    # as the other callers of getPilotProxyFromVOMSGroup() in this code do.
    # The group itself is the default, so groups without the option are
    # handled as before.
    group = getGroupOption(group, 'VOMSRole', group)

    ret = gProxyManager.getPilotProxyFromVOMSGroup(owner, group)
    if not ret['OK']:
        gLogger.error(ret['Message'])
        gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (owner, group))
        return S_ERROR("Failed to get the pilot's owner proxy")
    proxy = ret['Value']

    gridType = pilotDict['GridType']
    return getPilotLoggingInfo(proxy, gridType, pilotReference)
def __getGridJobOutput(self, pilotReference):
    """ Get the pilot job standard output and standard error files for the Grid job reference

    The output already stored in the pilot database is returned when it is
    not empty. Otherwise the output is fetched with getPilotOutput() for the
    LCG, gLite and CREAM grid types, or from the Computing Element of the
    pilot for any other grid type, and stored in the database when the
    standard output is not empty.

    :param pilotReference: the pilot reference, used as key in the pilot database
    :return: S_OK with a dictionary having the keys StdOut, StdErr, OwnerDN,
             OwnerGroup and FileList, or an error result
    """
    infoResult = pilotDB.getPilotInfo(pilotReference)
    if not (infoResult['OK'] and infoResult['Value']):
        return S_ERROR('Failed to get info for pilot ' + pilotReference)
    pilotInfo = infoResult['Value'][pilotReference]
    ownerDN = pilotInfo['OwnerDN']
    ownerGroup = pilotInfo['OwnerGroup']

    def packOutput(stdOut, stdErr, groupName, files):
        """ Build the dictionary handed back to the caller """
        return {'StdOut': stdOut,
                'StdErr': stdErr,
                'OwnerDN': ownerDN,
                'OwnerGroup': groupName,
                'FileList': files}

    def storeOutput(stdOut, stdErr):
        """ Store a non empty standard output, a failure is only logged """
        if stdOut:
            storeResult = pilotDB.storePilotOutput(pilotReference, stdOut, stdErr)
            if not storeResult['OK']:
                gLogger.error('Failed to store pilot output:', storeResult['Message'])

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    storedResult = pilotDB.getPilotOutput(pilotReference)
    if storedResult['OK']:
        stdOut = storedResult['Value']['StdOut']
        stdErr = storedResult['Value']['StdErr']
        if stdOut or stdErr:
            return S_OK(packOutput(stdOut, stdErr, ownerGroup, []))
        gLogger.warn('Empty pilot output found for %s' % pilotReference)

    gridType = pilotInfo['GridType']
    if gridType not in ["LCG", "gLite", "CREAM"]:
        # Instantiate the appropriate CE
        ceFactory = ComputingElementFactory()
        queueResult = getQueue(pilotInfo['GridSite'], pilotInfo['DestinationSite'], pilotInfo['Queue'])
        if not queueResult['OK']:
            return queueResult
        ceResult = ceFactory.getCE(gridType, pilotInfo['DestinationSite'], queueResult['Value'])
        if not ceResult['OK']:
            return ceResult
        computingElement = ceResult['Value']

        # The stamp, when there is one, is appended to the reference given to the CE
        stamp = pilotInfo['PilotStamp']
        jobReference = (pilotReference + ':::' + stamp) if stamp else pilotReference
        outputResult = computingElement.getJobOutput(jobReference)
        if not outputResult['OK']:
            return outputResult
        stdOut, stdErr = outputResult['Value']
        storeOutput(stdOut, stdErr)
        return S_OK(packOutput(stdOut, stdErr, ownerGroup, []))

    vomsGroup = getGroupOption(ownerGroup, 'VOMSRole', ownerGroup)
    proxyResult = gProxyManager.getPilotProxyFromVOMSGroup(ownerDN, vomsGroup)
    if not proxyResult['OK']:
        gLogger.error(proxyResult['Message'])
        gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (ownerDN, vomsGroup))
        return S_ERROR("Failed to get the pilot's owner proxy")

    gridResult = getPilotOutput(proxyResult['Value'], gridType, pilotReference, pilotInfo['PilotStamp'])
    if not gridResult['OK']:
        return S_ERROR('Failed to get pilot output: ' + gridResult['Message'])

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    stdOut = gridResult['StdOut']
    stdErr = gridResult['StdErr']
    fileList = gridResult['FileList']
    storeOutput(stdOut, stdErr)
    # On this path OwnerGroup carries the value obtained from getGroupOption()
    return S_OK(packOutput(stdOut, stdErr, vomsGroup, fileList))
def export_killPilot(self, pilotRefList):
    """ Kill the specified pilots

    The pilots are regrouped by owner, group, grid site, destination site
    and queue, and one kill request is sent to the Computing Element of
    each such group.

    :param pilotRefList: a single pilot reference (string) or a sequence of
                         pilot references
    :return: S_OK() if all the kill requests succeeded. S_ERROR if the
             information of a pilot or the proxy of its owner cannot be
             obtained, or if at least one kill request failed. A failed
             getQueue() or getCE() result is returned as it is.
    """
    # Make a list if it is not yet. isinstance() is used rather than an exact
    # type comparison so that an instance of a string subclass is also taken
    # as one reference instead of being split into its characters.
    if isinstance(pilotRefList, StringTypes):
        pilotRefs = [pilotRefList]
    else:
        pilotRefs = list(pilotRefList)

    # Regroup pilots per site and per owner
    pilotRefDict = {}
    for pilotReference in pilotRefs:
        result = pilotDB.getPilotInfo(pilotReference)
        if not result['OK'] or not result['Value']:
            return S_ERROR('Failed to get info for pilot ' + pilotReference)
        pilotDict = result['Value'][pilotReference]
        owner = pilotDict['OwnerDN']
        group = pilotDict['OwnerGroup']
        queue = '@@@'.join([
            owner, group, pilotDict['GridSite'],
            pilotDict['DestinationSite'], pilotDict['Queue']
        ])
        gridType = pilotDict['GridType']
        queueEntry = pilotRefDict.setdefault(queue, {})
        queueEntry.setdefault('PilotList', []).append(pilotReference)
        queueEntry['GridType'] = gridType

    # Do the work now queue by queue
    ceFactory = ComputingElementFactory()
    failed = []
    for key, pilotDict in pilotRefDict.items():
        # ceName is the CE name taken from the key, ce the CE object built from it
        owner, group, site, ceName, queue = key.split('@@@')
        result = getQueue(site, ceName, queue)
        if not result['OK']:
            return result
        queueDict = result['Value']
        gridType = pilotDict['GridType']
        result = ceFactory.getCE(gridType, ceName, queueDict)
        if not result['OK']:
            return result
        ce = result['Value']

        # Only for these grid types the CE is given the proxy of the pilot owner
        if gridType in ["LCG", "gLite", "CREAM"]:
            group = getGroupOption(group, 'VOMSRole', group)
            ret = gProxyManager.getPilotProxyFromVOMSGroup(owner, group)
            if not ret['OK']:
                gLogger.error(ret['Message'])
                gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (owner, group))
                return S_ERROR("Failed to get the pilot's owner proxy")
            ce.setProxy(ret['Value'])

        pilotList = pilotDict['PilotList']
        result = ce.killJob(pilotList)
        if not result['OK']:
            failed.extend(pilotList)

    if failed:
        return S_ERROR('Failed to kill at least some pilots')
    return S_OK()
def __getGridJobOutput(self, pilotReference):
    """ Get the pilot job standard output and standard error files for the Grid job reference

    A non empty output found in the pilot database is returned directly.
    Otherwise the output comes from getPilotOutput() for the LCG, gLite and
    CREAM grid types and from the Computing Element of the pilot for the
    other ones; it is stored in the database if the standard output is not
    empty.

    :param pilotReference: the pilot reference, used as key in the pilot database
    :return: S_OK with a dictionary having the keys StdOut, StdErr, OwnerDN,
             OwnerGroup and FileList, or an error result
    """
    dbInfo = pilotDB.getPilotInfo(pilotReference)
    if not (dbInfo['OK'] and dbInfo['Value']):
        return S_ERROR('Failed to get info for pilot ' + pilotReference)
    pilotRecord = dbInfo['Value'][pilotReference]
    pilotOwner = pilotRecord['OwnerDN']
    pilotGroup = pilotRecord['OwnerGroup']

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    dbOutput = pilotDB.getPilotOutput(pilotReference)
    if dbOutput['OK']:
        outText = dbOutput['Value']['StdOut']
        errText = dbOutput['Value']['StdErr']
        if outText or errText:
            return S_OK({'StdOut': outText,
                         'StdErr': errText,
                         'OwnerDN': pilotOwner,
                         'OwnerGroup': pilotGroup,
                         'FileList': []})
        gLogger.warn('Empty pilot output found for %s' % pilotReference)

    gridType = pilotRecord['GridType']
    if gridType in ["LCG", "gLite", "CREAM"]:
        # From here on the group is the value obtained from getGroupOption(),
        # it is also the one put in the returned dictionary
        pilotGroup = getGroupOption(pilotGroup, 'VOMSRole', pilotGroup)
        proxyAnswer = gProxyManager.getPilotProxyFromVOMSGroup(pilotOwner, pilotGroup)
        if not proxyAnswer['OK']:
            gLogger.error(proxyAnswer['Message'])
            gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (pilotOwner, pilotGroup))
            return S_ERROR("Failed to get the pilot's owner proxy")
        gridAnswer = getPilotOutput(proxyAnswer['Value'], gridType, pilotReference,
                                    pilotRecord['PilotStamp'])
        if not gridAnswer['OK']:
            return S_ERROR('Failed to get pilot output: ' + gridAnswer['Message'])
        # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
        outText = gridAnswer['StdOut']
        errText = gridAnswer['StdErr']
        retrievedFiles = gridAnswer['FileList']
    else:
        # Instantiate the appropriate CE
        ceFactory = ComputingElementFactory()
        queueAnswer = getQueue(pilotRecord['GridSite'], pilotRecord['DestinationSite'],
                               pilotRecord['Queue'])
        if not queueAnswer['OK']:
            return queueAnswer
        ceAnswer = ceFactory.getCE(gridType, pilotRecord['DestinationSite'], queueAnswer['Value'])
        if not ceAnswer['OK']:
            return ceAnswer
        # The stamp, when there is one, is appended to the reference given to the CE
        ceReference = pilotReference
        if pilotRecord['PilotStamp']:
            ceReference = ':::'.join([pilotReference, pilotRecord['PilotStamp']])
        ceOutput = ceAnswer['Value'].getJobOutput(ceReference)
        if not ceOutput['OK']:
            return ceOutput
        outText, errText = ceOutput['Value']
        retrievedFiles = []

    # Only a non empty standard output is stored, a failure is just logged
    if outText:
        stored = pilotDB.storePilotOutput(pilotReference, outText, errText)
        if not stored['OK']:
            gLogger.error('Failed to store pilot output:', stored['Message'])

    return S_OK({'StdOut': outText,
                 'StdErr': errText,
                 'OwnerDN': pilotOwner,
                 'OwnerGroup': pilotGroup,
                 'FileList': retrievedFiles})
def getJobOutput(self, jobID, _localDir=None):
    """ Get the specified job standard output and error files.

    The output is returned as strings.

    :param jobID: pilot reference followed by ':::' and the pilot stamp
    :param _localDir: not used by this method
    :return: S_OK((output, error)), or S_ERROR if the stamp is missing, the
             proxy of the pilot owner cannot be obtained or one of the
             globus-job-get-output commands fails
    """
    if ':::' in jobID:
        pilotRef, stamp = jobID.split(':::')
    else:
        pilotRef, stamp = jobID, ''
    if not stamp:
        return S_ERROR('Pilot stamp not defined for %s' % pilotRef)

    # somehow when this is called from the WMSAdministrator we don't
    # get the right proxy, so we do all this stuff here now. Probably
    # should be fixed in the WMSAdministrator?
    # Because this function is called from the WMSAdministrator, the
    # gridEnv that is picked up is not the one from the SiteDirector
    # Definition, but from Computing/CEDefaults
    infoResult = PilotAgentsDB().getPilotInfo(pilotRef)
    if not (infoResult['OK'] and infoResult['Value']):
        return S_ERROR('Failed to determine owner for pilot ' + pilotRef)
    pilotInfo = infoResult['Value'][pilotRef]
    ownerDN = pilotInfo['OwnerDN']
    vomsGroup = getGroupOption(pilotInfo['OwnerGroup'], 'VOMSRole', pilotInfo['OwnerGroup'])

    proxyResult = gProxyManager.getPilotProxyFromVOMSGroup(ownerDN, vomsGroup)
    if not proxyResult['OK']:
        self.log.error(proxyResult['Message'])
        self.log.error('Could not get proxy:', 'User "%s", Group "%s"' % (ownerDN, vomsGroup))
        return S_ERROR("Failed to get the pilot's owner proxy")
    self.proxy = proxyResult['Value']

    self.log.verbose("Getting output for: %s " % pilotRef)

    # Standard output: a return code of 1 together with a missing file message
    # is reported as a placeholder text instead of an error
    outCommand = ['globus-job-get-output', '-out', pilotRef]
    outResult = executeGridCommand(self.proxy, outCommand, self.gridEnv)
    if not outResult['OK']:
        return S_ERROR('Failed to retrieve output for %s' % jobID)
    outStatus = outResult['Value'][0]
    if not outStatus:
        output = outResult['Value'][1]
    elif outStatus == 1 and "No such file or directory" in outResult['Value'][2]:
        output = "Standard Output is not available on the Globus service"
    else:
        return S_ERROR('\n'.join(outResult['Value'][1:]))

    # Standard error: same treatment as for the standard output
    errCommand = ['globus-job-get-output', '-err', pilotRef]
    errResult = executeGridCommand(self.proxy, errCommand, self.gridEnv)
    if not errResult['OK']:
        return S_ERROR('Failed to retrieve error for %s' % jobID)
    errStatus = errResult['Value'][0]
    if not errStatus:
        error = errResult['Value'][1]
    elif errStatus == 1 and "No such file or directory" in errResult['Value'][2]:
        error = "Standard Error is not available on the Globus service"
    else:
        return S_ERROR('\n'.join(errResult['Value'][1:]))

    return S_OK((output, error))
def getJobOutput(self, jobID, _localDir=None):
    """ Get the specified job standard output and error files.

    The output is returned as strings.

    :param jobID: pilot reference followed by ':::' and the pilot stamp
    :param _localDir: not used by this method
    :return: S_OK((output, error)), or S_ERROR if the stamp is missing, the
             proxy of the pilot owner cannot be obtained or one of the
             globus-job-get-output commands fails
    """
    pilotRef = jobID
    stamp = ''
    if ':::' in jobID:
        pilotRef, stamp = jobID.split(':::')
    if not stamp:
        return S_ERROR('Pilot stamp not defined for %s' % pilotRef)

    # somehow when this is called from the WMSAdministrator we don't
    # get the right proxy, so we do all this stuff here now. Probably
    # should be fixed in the WMSAdministrator?
    # Because this function is called from the WMSAdministrator, the
    # gridEnv that is picked up is not the one from the SiteDirector
    # Definition, but from Computing/CEDefaults
    dbAnswer = PilotAgentsDB().getPilotInfo(pilotRef)
    if not (dbAnswer['OK'] and dbAnswer['Value']):
        return S_ERROR('Failed to determine owner for pilot ' + pilotRef)
    pilotRecord = dbAnswer['Value'][pilotRef]
    pilotOwner = pilotRecord['OwnerDN']
    pilotGroup = getGroupOption(pilotRecord['OwnerGroup'], 'VOMSRole', pilotRecord['OwnerGroup'])

    proxyAnswer = gProxyManager.getPilotProxyFromVOMSGroup(pilotOwner, pilotGroup)
    if not proxyAnswer['OK']:
        self.log.error(proxyAnswer['Message'])
        self.log.error('Could not get proxy:', 'User "%s", Group "%s"' % (pilotOwner, pilotGroup))
        return S_ERROR("Failed to get the pilot's owner proxy")
    self.proxy = proxyAnswer['Value']

    self.log.verbose("Getting output for: %s " % pilotRef)

    # First the standard output ...
    answer = executeGridCommand(self.proxy, ['globus-job-get-output', '-out', pilotRef], self.gridEnv)
    if not answer['OK']:
        return S_ERROR('Failed to retrieve output for %s' % jobID)
    commandOutput = answer['Value']
    if not commandOutput[0]:
        output = commandOutput[1]
    elif commandOutput[0] == 1 and "No such file or directory" in commandOutput[2]:
        # A missing file is reported as a placeholder text, not as an error
        output = "Standard Output is not available on the Globus service"
    else:
        return S_ERROR('\n'.join(commandOutput[1:]))

    # ... then the standard error, treated the same way
    answer = executeGridCommand(self.proxy, ['globus-job-get-output', '-err', pilotRef], self.gridEnv)
    if not answer['OK']:
        return S_ERROR('Failed to retrieve error for %s' % jobID)
    commandOutput = answer['Value']
    if not commandOutput[0]:
        error = commandOutput[1]
    elif commandOutput[0] == 1 and "No such file or directory" in commandOutput[2]:
        error = "Standard Error is not available on the Globus service"
    else:
        return S_ERROR('\n'.join(commandOutput[1:]))

    return S_OK((output, error))
def getGridJobOutput(pilotReference):
    """ Get the pilot job standard output and standard error files for the Grid job reference

    A non empty output found in the pilot database is returned directly.
    Otherwise the output is requested from the Computing Element of the pilot,
    using the proxy of the pilot owner, and stored in the database when the
    standard output is not empty.

    :param str pilotReference: a grid (job) pilot reference
    :return: S_OK with a dictionary having the keys StdOut, StdErr, OwnerDN,
             OwnerGroup and FileList, or an error result
    """
    result = pilotAgentsDB.getPilotInfo(pilotReference)
    if not result['OK'] or not result['Value']:
        return S_ERROR('Failed to get info for pilot ' + pilotReference)

    pilotDict = result['Value'][pilotReference]
    owner = pilotDict['OwnerDN']
    group = pilotDict['OwnerGroup']

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    result = pilotAgentsDB.getPilotOutput(pilotReference)
    if result['OK']:
        stdout = result['Value']['StdOut']
        error = result['Value']['StdErr']
        if stdout or error:
            resultDict = {}
            resultDict['StdOut'] = stdout
            resultDict['StdErr'] = error
            resultDict['OwnerDN'] = owner
            resultDict['OwnerGroup'] = group
            resultDict['FileList'] = []
            return S_OK(resultDict)
        else:
            gLogger.warn('Empty pilot output found', 'for %s' % pilotReference)

    # Instantiate the appropriate CE
    ceFactory = ComputingElementFactory()
    result = getQueue(pilotDict['GridSite'], pilotDict['DestinationSite'], pilotDict['Queue'])
    if not result['OK']:
        return result
    queueDict = result['Value']
    gridEnv = getGridEnv()
    queueDict['GridEnv'] = gridEnv
    queueDict['WorkingDirectory'] = mkdtemp()

    # The temporary working directory has to be removed on every way out of
    # the code below, including the proxy failure and unexpected exceptions,
    # hence the try/finally instead of a removal before each return.
    try:
        result = ceFactory.getCE(pilotDict['GridType'], pilotDict['DestinationSite'], queueDict)
        if not result['OK']:
            return result
        ce = result['Value']

        groupVOMS = getGroupOption(group, 'VOMSRole', group)
        result = gProxyManager.getPilotProxyFromVOMSGroup(owner, groupVOMS)
        if not result['OK']:
            gLogger.error(
                'Could not get proxy:',
                'User "%s" Group "%s" : %s' % (owner, groupVOMS, result['Message']))
            return S_ERROR("Failed to get the pilot's owner proxy")
        proxy = result['Value']
        ce.setProxy(proxy)

        # The stamp, when there is one, is appended to the reference given to the CE
        pilotStamp = pilotDict['PilotStamp']
        pRef = pilotReference
        if pilotStamp:
            pRef = pRef + ':::' + pilotStamp
        result = ce.getJobOutput(pRef)
        if not result['OK']:
            return result
        stdout, error = result['Value']

        # Only a non empty standard output is stored, a failure is just logged
        if stdout:
            result = pilotAgentsDB.storePilotOutput(pilotReference, stdout, error)
            if not result['OK']:
                gLogger.error('Failed to store pilot output:', result['Message'])

        resultDict = {}
        resultDict['StdOut'] = stdout
        resultDict['StdErr'] = error
        resultDict['OwnerDN'] = owner
        resultDict['OwnerGroup'] = group
        resultDict['FileList'] = []
        return S_OK(resultDict)
    finally:
        shutil.rmtree(queueDict['WorkingDirectory'])
def execute(self):
    """The PilotAgent execution method.

    Reads the agent options, then, for every (owner DN, owner group, grid
    type, broker) combination returned by getPilotGroups() with an eligible
    grid type, queries the status of the selected pilots in chunks of
    MAX_JOBS_QUERY references. Pilots in a final status are passed to
    accountPilots(), the status of the other ones is updated in the pilot
    database. At the end handleOldPilots() is called.

    :return: S_OK() at the end of the cycle or when no pilot group is found;
             the failed result of _getConnection(), getPilotGroups() or
             selectPilots() otherwise
    """
    self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
    self.gridEnv = self.am_getOption('GridEnv')
    if not self.gridEnv:
        # No specific option found, try a general one
        setup = gConfig.getValue('/DIRAC/Setup', '')
        if setup:
            instance = gConfig.getValue('/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
            if instance:
                self.gridEnv = gConfig.getValue('/Systems/WorkloadManagement/%s/GridEnv' % instance, '')

    result = self.pilotDB._getConnection()
    if not result['OK']:
        return result
    connection = result['Value']

    # The connection is closed in the finally clause so that the early returns
    # below (and unexpected exceptions) do not leave it open.
    try:
        result = self.pilotDB.getPilotGroups(self.identityFieldsList,
                                             {'Status': self.queryStateList})
        if not result['OK']:
            self.log.error('Fail to get identities Groups', result['Message'])
            return result
        if not result['Value']:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result['Value']:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose('Getting pilots for %s:%s @ %s %s' % (ownerDN, ownerGroup, gridType, broker))

            condDict1 = {'Status': 'Done',
                         'StatusReason': 'Report from JobAgent',
                         'OwnerDN': ownerDN,
                         'OwnerGroup': ownerGroup,
                         'GridType': gridType,
                         'Broker': broker}

            condDict2 = {'Status': self.queryStateList,
                         'OwnerDN': ownerDN,
                         'OwnerGroup': ownerGroup,
                         'GridType': gridType,
                         'Broker': broker}

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to clear Waiting Pilot Jobs')

                result = self.pilotDB.selectPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to get the Pilot Agents')
                    return result
                if not result['Value']:
                    continue
                refList = result['Value']

                ret = gProxyManager.getPilotProxyFromVOMSGroup(ownerDN, ownerGroup)
                if not ret['OK']:
                    self.log.error(ret['Message'])
                    self.log.error('Could not get proxy:', 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret['Value']

                self.log.verbose("Getting status for %s pilots for owner %s and group %s" %
                                 (len(refList), ownerDN, ownerGroup))

                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index:start_index + MAX_JOBS_QUERY]
                    self.log.verbose('Querying %d pilots of %s starting at %d' %
                                     (len(refsToQuery), len(refList), start_index))
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result['OK']:
                        if result['Message'] == 'Broker not Available':
                            # The remaining chunks of this selection are skipped
                            self.log.error('Broker %s not Available' % broker)
                            break
                        self.log.warn('Failed to get pilot status:')
                        self.log.warn('%s:%s @ %s' % (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result['Value']
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if pDict:
                            if pDict['isParent']:
                                self.log.verbose('Clear parametric parent %s' % pRef)
                                result = self.clearParentJob(pRef, pDict, connection)
                                if not result['OK']:
                                    self.log.warn(result['Message'])
                                else:
                                    self.log.info('Parameteric parent removed: %s' % pRef)
                            if pDict['FinalStatus']:
                                # Final statuses are accounted in batches below
                                self.log.verbose('Marking Status for %s to %s' % (pRef, pDict['Status']))
                                pilotsToAccount[pRef] = pDict
                            else:
                                self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
                                self.pilotDB.setPilotStatus(pRef,
                                                            pDict['Status'],
                                                            pDict['DestinationSite'],
                                                            updateTime=pDict['StatusDate'],
                                                            conn=connection)

                    # Flush the pilots to account once more than 100 are collected
                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        self.handleOldPilots(connection)

        return S_OK()
    finally:
        connection.close()
def execute(self):
    """The PilotAgent execution method.

    Reads the agent options, then, for every (owner DN, owner group, grid
    type, broker) combination returned by getPilotGroups() with an eligible
    grid type, queries the status of the selected pilots in chunks of
    MAX_JOBS_QUERY references. Pilots in a final status are passed to
    accountPilots(), the status of the other ones is updated in the pilot
    database. At the end handleOldPilots() is called.

    :return: S_OK() at the end of the cycle or when no pilot group is found;
             the failed result of _getConnection(), getPilotGroups() or
             selectPilots() otherwise
    """
    self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
    self.gridEnv = self.am_getOption('GridEnv')
    if not self.gridEnv:
        # No specific option found, try a general one
        setup = gConfig.getValue('/DIRAC/Setup', '')
        if setup:
            instance = gConfig.getValue('/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
            if instance:
                self.gridEnv = gConfig.getValue('/Systems/WorkloadManagement/%s/GridEnv' % instance, '')

    result = self.pilotDB._getConnection()
    if not result['OK']:
        return result
    connection = result['Value']

    # The connection is closed in the finally clause so that the early returns
    # below (and unexpected exceptions) do not leave it open.
    try:
        result = self.pilotDB.getPilotGroups(self.identityFieldsList,
                                             {'Status': self.queryStateList})
        if not result['OK']:
            self.log.error('Fail to get identities Groups', result['Message'])
            return result
        if not result['Value']:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result['Value']:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose('Getting pilots for %s:%s @ %s %s' % (ownerDN, ownerGroup, gridType, broker))

            condDict1 = {'Status': 'Done',
                         'StatusReason': 'Report from JobAgent',
                         'OwnerDN': ownerDN,
                         'OwnerGroup': ownerGroup,
                         'GridType': gridType,
                         'Broker': broker}

            condDict2 = {'Status': self.queryStateList,
                         'OwnerDN': ownerDN,
                         'OwnerGroup': ownerGroup,
                         'GridType': gridType,
                         'Broker': broker}

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to clear Waiting Pilot Jobs')

                result = self.pilotDB.selectPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to get the Pilot Agents')
                    return result
                if not result['Value']:
                    continue
                refList = result['Value']

                ret = gProxyManager.getPilotProxyFromVOMSGroup(ownerDN, ownerGroup)
                if not ret['OK']:
                    self.log.error(ret['Message'])
                    self.log.error('Could not get proxy:', 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret['Value']

                self.log.verbose("Getting status for %s pilots for owner %s and group %s" %
                                 (len(refList), ownerDN, ownerGroup))

                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index:start_index + MAX_JOBS_QUERY]
                    self.log.verbose('Querying %d pilots of %s starting at %d' %
                                     (len(refsToQuery), len(refList), start_index))
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result['OK']:
                        if result['Message'] == 'Broker not Available':
                            # The remaining chunks of this selection are skipped
                            self.log.error('Broker %s not Available' % broker)
                            break
                        self.log.warn('Failed to get pilot status:')
                        self.log.warn('%s:%s @ %s' % (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result['Value']
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if pDict:
                            if pDict['isParent']:
                                self.log.verbose('Clear parametric parent %s' % pRef)
                                result = self.clearParentJob(pRef, pDict, connection)
                                if not result['OK']:
                                    self.log.warn(result['Message'])
                                else:
                                    self.log.info('Parametric parent removed: %s' % pRef)
                            if pDict['FinalStatus']:
                                # Final statuses are accounted in batches below
                                self.log.verbose('Marking Status for %s to %s' % (pRef, pDict['Status']))
                                pilotsToAccount[pRef] = pDict
                            else:
                                self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
                                self.pilotDB.setPilotStatus(pRef,
                                                            pDict['Status'],
                                                            pDict['DestinationSite'],
                                                            updateTime=pDict['StatusDate'],
                                                            conn=connection)

                    # Flush the pilots to account once more than 100 are collected
                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        self.handleOldPilots(connection)

        return S_OK()
    finally:
        connection.close()
def __getGridJobOutput(self, pilotReference):
    """ Get the pilot job standard output and standard error files for the Grid job reference

    The output found in the pilot database is returned when the database
    call succeeds; otherwise, for the LCG, gLite and CREAM grid types, the
    output is fetched with getPilotOutput() using the proxy of the pilot
    owner and stored in the database.

    :param pilotReference: the pilot reference, used as key in the pilot database
    :return: S_OK with a dictionary having the keys StdOut, StdErr, OwnerDN,
             OwnerGroup and FileList, or S_ERROR
    """
    dbInfo = pilotDB.getPilotInfo(pilotReference)
    if not (dbInfo['OK'] and dbInfo['Value']):
        return S_ERROR('Failed to get info for pilot ' + pilotReference)
    pilotRecord = dbInfo['Value'][pilotReference]
    pilotOwner = pilotRecord['OwnerDN']
    pilotGroup = pilotRecord['OwnerGroup']

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    dbOutput = pilotDB.getPilotOutput(pilotReference)
    if dbOutput['OK']:
        outText = dbOutput['Value']['StdOut']
        errText = dbOutput['Value']['StdErr']
        if outText or errText:
            return S_OK({'StdOut': outText,
                         'StdErr': errText,
                         'OwnerDN': pilotOwner,
                         'OwnerGroup': pilotGroup,
                         'FileList': []})
        return S_ERROR('Empty pilot output found')

    gridType = pilotRecord['GridType']
    if gridType not in ["LCG", "gLite", "CREAM"]:
        return S_ERROR('Can not retrieve pilot output for the Grid %s ' % gridType)

    # From here on the group is the value obtained from getGroupOption(),
    # it is also the one put in the returned dictionary
    pilotGroup = getGroupOption(pilotGroup, 'VOMSRole', pilotGroup)
    proxyAnswer = gProxyManager.getPilotProxyFromVOMSGroup(pilotOwner, pilotGroup)
    if not proxyAnswer['OK']:
        gLogger.error(proxyAnswer['Message'])
        gLogger.error('Could not get proxy:', 'User "%s", Group "%s"' % (pilotOwner, pilotGroup))
        return S_ERROR("Failed to get the pilot's owner proxy")

    gridAnswer = getPilotOutput(proxyAnswer['Value'], gridType, pilotReference,
                                pilotRecord['PilotStamp'])
    if not gridAnswer['OK']:
        return S_ERROR('Failed to get pilot output: ' + gridAnswer['Message'])

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    outText = gridAnswer['StdOut']
    errText = gridAnswer['StdErr']
    retrievedFiles = gridAnswer['FileList']

    # A failure to store the output is only logged, the output is still returned
    stored = pilotDB.storePilotOutput(pilotReference, outText, errText)
    if not stored['OK']:
        gLogger.error('Failed to store pilot output:', stored['Message'])

    return S_OK({'StdOut': outText,
                 'StdErr': errText,
                 'OwnerDN': pilotOwner,
                 'OwnerGroup': pilotGroup,
                 'FileList': retrievedFiles})