def __init__(self, *args, **kwargs):
    """Constructor.

    Forwards every argument to ``AgentModule.__init__`` and then resets all
    the attributes this agent keeps on the instance to their initial values.
    """
    AgentModule.__init__(self, *args, **kwargs)

    # Containers start out empty; failedVMTypes yields 0 for unseen keys.
    self.vmTypeDict = dict()
    self.vmTypeCECache = dict()
    self.vmTypeSlots = dict()
    self.failedVMTypes = defaultdict(int)
    self.firstPass = True

    self.vo = ""
    self.group = ""
    # self.voGroups contain all the eligible user groups for clouds submitted by this SiteDirector
    self.voGroups = list()
    self.cloudDN = ""
    self.cloudGroup = ""

    self.platforms = list()
    self.sites = list()

    self.wmsClient = WMSAdministratorClient()
    self.proxy = None

    # Boolean switches, set to their defaults.
    self.updateStatus = True
    self.getOutput = False
    self.sendAccounting = True
def __init__(self, args=None, clients=None):
    """Initialise the command and select the WMSAdministrator client.

    :param args: forwarded to the parent constructor
    :param clients: forwarded to the parent constructor
    """
    super(JobsWMSCommand, self).__init__(args, clients)

    # Use the client registered in self.apis when there is one, else build a new one.
    self.wmsAdmin = (
        self.apis["WMSAdministrator"]
        if "WMSAdministrator" in self.apis
        else WMSAdministratorClient()
    )
def getSiteStatuses(self, siteNames=None):
    """Return the status of the requested sites.

    When ``self.rssFlag`` is set the request is delegated to
    ``self.__getRSSSiteStatus``. Otherwise the site mask is queried through
    ``WMSAdministratorClient.getSiteMaskStatus``: once per name when names
    are given, or once without arguments when ``siteNames`` is empty/None.

    Example of a successful value::

      {'test1.test1.org': 'Active',
       'test2.test2.org': 'Banned'}

    :param siteNames: name(s) of the sites to be matched; a single string is
                      treated as a one-element list, None means all sites
    :type siteNames: list, str
    :return: S_OK(dict) || the first failed result returned by the client
    """
    if self.rssFlag:
        return self.__getRSSSiteStatus(siteNames)

    wmsAdmin = WMSAdministratorClient()

    if not siteNames:
        # No selection given: one call without arguments supplies the value.
        res = wmsAdmin.getSiteMaskStatus()
        if not res['OK']:
            return res
        return S_OK(res['Value'])

    if isinstance(siteNames, basestring):
        siteNames = [siteNames]

    statuses = {}
    for name in siteNames:
        res = wmsAdmin.getSiteMaskStatus(name)
        if not res['OK']:
            # Stop at the first failure and hand the error back untouched.
            return res
        statuses[name] = res['Value']
    return S_OK(statuses)
def getSiteStatuses(self, siteNames=None):
    """Look up the status of one, several or all sites.

    With ``self.rssFlag`` set, the call is forwarded to
    ``self.__getRSSSiteStatus``. Without it, ``WMSAdministratorClient`` is
    asked for the site mask status, either per requested site or, when no
    site is requested, in a single call without arguments.

    Example of a successful value::

      {'test1.test1.org': 'Active',
       'test2.test2.org': 'Banned'}

    :param siteNames: name(s) of the sites to be matched; a plain string is
                      wrapped into a list, None/empty selects everything
    :type siteNames: list, str
    :return: S_OK(dict) || the first failed result returned by the client
    """
    if self.rssFlag:
        return self.__getRSSSiteStatus(siteNames)

    client = WMSAdministratorClient()
    if siteNames:
        requested = [siteNames] if isinstance(siteNames, six.string_types) else siteNames
        collected = {}
        for requestedSite in requested:
            answer = client.getSiteMaskStatus(requestedSite)
            if not answer['OK']:
                return answer
            collected[requestedSite] = answer['Value']
    else:
        answer = client.getSiteMaskStatus()
        if not answer['OK']:
            return answer
        collected = answer['Value']

    return S_OK(collected)
def __init__(self, args=None, clients=None):
    """Initialise the command and resolve the two clients it works with.

    Each client is taken from ``self.apis`` when registered there under its
    key; otherwise a new instance is created.

    :param args: forwarded to the parent constructor
    :param clients: forwarded to the parent constructor
    """
    super(PilotCommand, self).__init__(args, clients)

    self.wmsAdmin = (
        self.apis['WMSAdministrator']
        if 'WMSAdministrator' in self.apis
        else WMSAdministratorClient()
    )
    self.rmClient = (
        self.apis['ResourceManagementClient']
        if 'ResourceManagementClient' in self.apis
        else ResourceManagementClient()
    )
def __init__(self, args=None, clients=None):
    """Initialise the command and resolve its WMS and RSS management clients.

    A client found in ``self.apis`` is reused; a missing one is instantiated.

    :param args: forwarded to the parent constructor
    :param clients: forwarded to the parent constructor
    """
    super(JobCommand, self).__init__(args, clients)

    self.wmsAdmin = (
        self.apis["WMSAdministrator"]
        if "WMSAdministrator" in self.apis
        else WMSAdministratorClient()
    )
    self.rmClient = (
        self.apis["ResourceManagementClient"]
        if "ResourceManagementClient" in self.apis
        else ResourceManagementClient()
    )
def initSites():
    """Initialize the Site statuses from the "SiteMask" table of the "JobDB" database.

    The site mask is read through ``WMSAdministratorClient.getAllSiteMaskStatus``
    and every site is written with ``addOrModifyStatusElement``, using the first
    element of its mask entry as status and the part of the site name before the
    first dot as element type. Any failure is logged and ``DIRACExit(1)`` is called.

    :return: S_OK()
    """
    rssClient = ResourceStatusClient.ResourceStatusClient()

    sites = WMSAdministratorClient().getAllSiteMaskStatus()
    if not sites['OK']:
        subLogger.error(sites['Message'])
        DIRACExit(1)

    # dict.items() exists on both Python 2 and Python 3; iteritems() is Python 2 only.
    for site, elements in sites['Value'].items():
        result = rssClient.addOrModifyStatusElement(
            "Site", "Status",
            name=site,
            statusType='all',
            status=elements[0],
            elementType=site.split('.')[0],
            tokenOwner='rs_svc',
            reason='dirac-rss-sync')
        if not result['OK']:
            subLogger.error(result['Message'])
            DIRACExit(1)

    return S_OK()
def initialize(self):
    """Set the default agent options and create the DB/client handles.

    :return: S_OK()
    """
    # Defaults are applied in this fixed order.
    for optionName, defaultValue in (('PollingTime', 120),
                                     ('GridEnv', ''),
                                     ('PilotStalledDays', 3)):
        self.am_setOption(optionName, defaultValue)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # Delays handed to WMSAdministrator.clearPilots by the agent.
    self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
    self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay', 7)

    self.WMSAdministrator = WMSAdministratorClient()
    return S_OK()
def banSite(self, site, comment, printOutput=False):
    """Set the status of a site to Banned.

    The site is validated first; if it is already in the Banned mask nothing
    is changed. The ban goes through ``self.sitestatus`` when ``self.rssFlag``
    is set, otherwise through ``WMSAdministratorClient``.

    Example usage:

      >>> gLogger.notice(diracAdmin.banSite('LCG.Example.org', 'maintenance'))
      {'OK': True, 'Value': ...}

    :param site: site name
    :param comment: comment passed along with the status change
    :param printOutput: when True, report the outcome through gLogger
    :return: S_OK,S_ERROR
    """
    validity = self.__checkSiteIsValid(site)
    if not validity['OK']:
        return validity

    banned = self.getSiteMask(status='Banned')
    if not banned['OK']:
        return banned

    if site in banned['Value']:
        if printOutput:
            gLogger.notice('Site %s is already Banned' % site)
        return S_OK('Site %s is already Banned' % site)

    if self.rssFlag:
        outcome = self.sitestatus.setSiteStatus(site, 'Banned', comment)
    else:
        outcome = WMSAdministratorClient().banSite(site, comment)

    # Only announce the change when it actually succeeded.
    if outcome['OK'] and printOutput:
        gLogger.notice('Site %s status is set to Banned' % site)
    return outcome
def initSites():
    """Initialize the Site statuses from the "SiteMask" table of the "JobDB" database.

    Every site reported by ``WMSAdministratorClient.getAllSiteMaskStatus`` is
    stored with ``addOrModifyStatusElement``. A failed call is logged and
    followed by ``DIRACExit(1)``.

    :return: S_OK()
    """
    from DIRAC.WorkloadManagementSystem.Client.WMSAdministratorClient import WMSAdministratorClient
    from DIRAC.ResourceStatusSystem.Client import ResourceStatusClient

    def _abortOnFailure(answer):
        # Log the message of a failed call and request exit with code 1.
        if not answer["OK"]:
            subLogger.error(answer["Message"])
            DIRACExit(1)

    statusClient = ResourceStatusClient.ResourceStatusClient()

    maskResult = WMSAdministratorClient().getAllSiteMaskStatus()
    _abortOnFailure(maskResult)

    for siteName, maskEntry in maskResult["Value"].items():
        # The status is the first element of the mask entry; the element type
        # is whatever precedes the first dot of the site name.
        _abortOnFailure(
            statusClient.addOrModifyStatusElement(
                "Site",
                "Status",
                name=siteName,
                statusType="all",
                status=maskEntry[0],
                elementType=siteName.split(".")[0],
                tokenOwner="rs_svc",
                reason="dirac-rss-sync",
            )
        )

    return S_OK()
def allowSite(self, site, comment, printOutput=False):
    """Set the status of a site to Active.

    The site is validated first; if it is already in the Active mask nothing
    is changed. The change goes through ``self.sitestatus`` when
    ``self.rssFlag`` is set, otherwise through ``WMSAdministratorClient``.

    Example usage:

      >>> gLogger.notice(diracAdmin.allowSite('LCG.Example.org', 'back online'))
      {'OK': True, 'Value': ...}

    :param site: site name
    :param comment: comment passed along with the status change
    :param printOutput: when True, report the outcome through gLogger
    :return: S_OK,S_ERROR
    """
    validity = self.__checkSiteIsValid(site)
    if not validity['OK']:
        return validity

    active = self.getSiteMask(status='Active')
    if not active['OK']:
        return active

    if site in active['Value']:
        if printOutput:
            gLogger.notice('Site %s is already Active' % site)
        return S_OK('Site %s is already Active' % site)

    if self.rssFlag:
        outcome = self.sitestatus.setSiteStatus(site, 'Active', comment)
    else:
        outcome = WMSAdministratorClient().allowSite(site, comment)

    # Only announce the change when it actually succeeded.
    if outcome['OK'] and printOutput:
        gLogger.notice('Site %s status is set to Active' % site)
    return outcome
def initialize(self):
    """Define the commands to be executed and instantiate the shared clients.

    The ResourceStatusClient and ResourceManagementClient classes are
    resolved through ``ObjectLoader``; when a lookup fails the error is
    logged and the failed result is returned.

    :return: S_OK() on success, otherwise the failed ObjectLoader result
    """
    rsLoad = ObjectLoader().loadObject('DIRAC.ResourceStatusSystem.Client.ResourceStatusClient',
                                       'ResourceStatusClient')
    if not rsLoad['OK']:
        self.log.error('Failed to load ResourceStatusClient class: %s' % rsLoad['Message'])
        return rsLoad
    rsClass = rsLoad['Value']

    rmLoad = ObjectLoader().loadObject('DIRAC.ResourceStatusSystem.Client.ResourceManagementClient',
                                       'ResourceManagementClient')
    if not rmLoad['OK']:
        self.log.error('Failed to load ResourceManagementClient class: %s' % rmLoad['Message'])
        return rmLoad
    rmClass = rmLoad['Value']

    # Each enabled command is registered under its own name with empty arguments.
    for commandName in ('Downtime', 'GOCDBSync', 'FreeDiskSpace'):
        self.commands[commandName] = [{commandName: {}}]

    # PilotsCommand
    # self.commands[ 'Pilots' ] = [
    #   { 'PilotsWMS' : { 'element' : 'Site', 'siteName' : None } },
    #   { 'PilotsWMS' : { 'element' : 'Resource', 'siteName' : None } }
    # ]

    # FIXME: do not forget about hourly vs Always ...etc
    # AccountingCacheCommand
    # self.commands[ 'AccountingCache' ] = [
    #   {'SuccessfullJobsBySiteSplitted' :{'hours' :24, 'plotType' :'Job' }},
    #   {'FailedJobsBySiteSplitted' :{'hours' :24, 'plotType' :'Job' }},
    #   {'SuccessfullPilotsBySiteSplitted' :{'hours' :24, 'plotType' :'Pilot' }},
    #   {'FailedPilotsBySiteSplitted' :{'hours' :24, 'plotType' :'Pilot' }},
    #   {'SuccessfullPilotsByCESplitted' :{'hours' :24, 'plotType' :'Pilot' }},
    #   {'FailedPilotsByCESplitted' :{'hours' :24, 'plotType' :'Pilot' }},
    #   {'RunningJobsBySiteSplitted' :{'hours' :24, 'plotType' :'Job' }},
    #   # {'RunningJobsBySiteSplitted' :{'hours' :168, 'plotType' :'Job' }},
    #   # {'RunningJobsBySiteSplitted' :{'hours' :720, 'plotType' :'Job' }},
    #   # {'RunningJobsBySiteSplitted' :{'hours' :8760, 'plotType' :'Job' }},
    # ]

    # VOBOXAvailability
    # self.commands[ 'VOBOXAvailability' ] = [
    #   { 'VOBOXAvailability' : {} }
    #

    # Reuse clients for the commands
    self.clients['GOCDBClient'] = GOCDBClient()
    self.clients['ReportsClient'] = ReportsClient()
    self.clients['ResourceStatusClient'] = rsClass()
    self.clients['ResourceManagementClient'] = rmClass()
    self.clients['WMSAdministrator'] = WMSAdministratorClient()
    self.clients['Pilots'] = PilotManagerClient()

    # The class itself is stored, not an instance.
    self.cCaller = CommandCaller

    return S_OK()
def getPilotOutput(self, gridReference, directory=''):
    """Retrieve the pilot output (std.out and std.err) for a pilot reference.

    The output is fetched with ``WMSAdministratorClient.getPilotOutput`` and
    written to ``<directory>/pilot_<last part of the reference>/``. The target
    directory must not exist yet.

      >>> print dirac.getPilotOutput('https://some.host/pilotRef')
      {'OK': True, 'Value': {}}

    :param gridReference: pilot reference
    :type gridReference: string
    :param directory: existing base directory; ``self.currentDir`` when empty
    :type directory: string
    :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
        return self._errorReport('Expected string for pilot reference')

    directory = directory or self.currentDir
    if not os.path.exists(directory):
        return self._errorReport('Directory %s does not exist' % directory)

    result = WMSAdministratorClient().getPilotOutput(gridReference)
    if not result['OK']:
        return result

    # Name the output folder after the last path component of the reference.
    gridReferenceSmall = gridReference.split('/')[-1] or 'reference'
    outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall)

    if os.path.exists(outputPath):
        self.log.info('Remove %s and retry to continue' % outputPath)
        return S_ERROR('Remove %s and retry to continue' % outputPath)

    if not os.path.exists(outputPath):
        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

    outputs = result['Value']

    def _save(key, pathTemplate, writtenMessage, missingMessage):
        # Write outputs[key] to its file, or warn when the key is absent.
        if key not in outputs:
            self.log.warn(missingMessage)
            return
        target = pathTemplate % (outputPath)
        with open(target, 'w') as handle:
            handle.write(outputs[key])
        self.log.info(writtenMessage % (target))

    _save('StdOut', '%s/std.out', 'Standard output written to %s', 'No standard output returned')
    _save('StdErr', '%s/std.err', 'Standard error written to %s', 'No standard error returned')

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result
def __init__(self, args=None, clients=None):
    """Initialise the command and resolve the two clients it works with.

    Each client is taken from ``self.apis`` when registered there under its
    key; otherwise a new instance is created.

    :param args: forwarded to the parent constructor
    :param clients: forwarded to the parent constructor
    """
    super(PilotCommand, self).__init__(args, clients)

    haveWMS = 'WMSAdministrator' in self.apis
    self.wmsAdmin = self.apis['WMSAdministrator'] if haveWMS else WMSAdministratorClient()

    haveRM = 'ResourceManagementClient' in self.apis
    self.rmClient = self.apis['ResourceManagementClient'] if haveRM else ResourceManagementClient()
def finalize(self):
    """Job Agent finalization method.

    Reports the pilot as 'Done' through ``WMSAdministratorClient.setPilotStatus``
    using the local CE and queue read from the configuration. A failed report
    is only logged as a warning.

    :return: S_OK()
    """
    localCE = gConfig.getValue('/LocalSite/GridCE', '')
    localQueue = gConfig.getValue('/LocalSite/CEQueue', '')

    report = WMSAdministratorClient().setPilotStatus(
        str(self.pilotReference),
        'Done',
        localCE,
        'Report from JobAgent',
        self.siteName,
        localQueue,
    )
    if not report['OK']:
        self.log.warn(report['Message'])

    return S_OK()
def initialize(self):
    """Define the commands to be executed and instantiate the shared clients.

    Also sets the 'shifterProxy' option to 'DataManager' and keeps a
    ResourceManagementClient on ``self.rmClient``.

    :return: S_OK()
    """
    self.am_setOption('shifterProxy', 'DataManager')

    self.rmClient = ResourceManagementClient()

    # Each enabled command is registered under its own name with empty arguments.
    for commandName in ('Downtime', 'GOCDBSync', 'FreeDiskSpace'):
        self.commands[commandName] = [{commandName: {}}]

    # PilotsCommand
    # self.commands[ 'Pilots' ] = [
    #   { 'PilotsWMS' : { 'element' : 'Site', 'siteName' : None } },
    #   { 'PilotsWMS' : { 'element' : 'Resource', 'siteName' : None } }
    # ]

    # FIXME: do not forget about hourly vs Always ...etc
    # AccountingCacheCommand
    # self.commands[ 'AccountingCache' ] = [
    #   {'SuccessfullJobsBySiteSplitted' :{'hours' :24, 'plotType' :'Job' }},
    #   {'FailedJobsBySiteSplitted' :{'hours' :24, 'plotType' :'Job' }},
    #   {'SuccessfullPilotsBySiteSplitted' :{'hours' :24, 'plotType' :'Pilot' }},
    #   {'FailedPilotsBySiteSplitted' :{'hours' :24, 'plotType' :'Pilot' }},
    #   {'SuccessfullPilotsByCESplitted' :{'hours' :24, 'plotType' :'Pilot' }},
    #   {'FailedPilotsByCESplitted' :{'hours' :24, 'plotType' :'Pilot' }},
    #   {'RunningJobsBySiteSplitted' :{'hours' :24, 'plotType' :'Job' }},
    #   # {'RunningJobsBySiteSplitted' :{'hours' :168, 'plotType' :'Job' }},
    #   # {'RunningJobsBySiteSplitted' :{'hours' :720, 'plotType' :'Job' }},
    #   # {'RunningJobsBySiteSplitted' :{'hours' :8760, 'plotType' :'Job' }},
    # ]

    # VOBOXAvailability
    # self.commands[ 'VOBOXAvailability' ] = [
    #   { 'VOBOXAvailability' : {} }
    #

    # Reuse clients for the commands
    self.clients['GOCDBClient'] = GOCDBClient()
    self.clients['ReportGenerator'] = RPCClient('Accounting/ReportGenerator')
    self.clients['ReportsClient'] = ReportsClient()
    self.clients['ResourceStatusClient'] = ResourceStatusClient()
    self.clients['ResourceManagementClient'] = ResourceManagementClient()
    self.clients['WMSAdministrator'] = WMSAdministratorClient()

    # The class itself is stored, not an instance.
    self.cCaller = CommandCaller

    return S_OK()
def killPilot(self, gridReference):
    """Kill the pilot specified.

      >>> print dirac.killPilot('https://some.host/pilotRef')
      {'OK': True, 'Value': {}}

    :param gridReference: Pilot Job Reference
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, basestring):
        return WMSAdministratorClient().killPilot(gridReference)

    # Anything that is not a string is rejected before contacting the service.
    return self._errorReport('Expected string for pilot reference')
def getJobPilotOutput(self, jobID, directory=""):
    """Retrieve the pilot output for an existing job in the WMS.

    The output is fetched with ``WMSAdministratorClient.getJobPilotOutput``
    and written to ``<directory>/pilot_<jobID>/``, which must not exist yet.

      >>> gLogger.notice(dirac.getJobPilotOutput(12345))
      {'OK': True, StdOut:'',StdError:''}

    :param jobID: JobID
    :type jobID: integer or string
    :param directory: existing base directory; ``self.currentDir`` when empty
    :type directory: string
    :return: S_OK,S_ERROR
    """
    directory = directory or self.currentDir
    if not os.path.exists(directory):
        return self._errorReport("Directory %s does not exist" % directory)

    result = WMSAdministratorClient().getJobPilotOutput(jobID)
    if not result["OK"]:
        return result

    outputPath = "%s/pilot_%s" % (directory, jobID)
    if os.path.exists(outputPath):
        self.log.info("Remove %s and retry to continue" % outputPath)
        return S_ERROR("Remove %s and retry to continue" % outputPath)

    if not os.path.exists(outputPath):
        self.log.verbose("Creating directory %s" % outputPath)
        os.mkdir(outputPath)

    outputs = result["Value"]

    def _save(key, pathTemplate, writtenMessage, missingMessage):
        # Write outputs[key] to its file, or warn when the key is absent.
        if key not in outputs:
            self.log.warn(missingMessage)
            return
        target = pathTemplate % (outputPath)
        with open(target, "w") as handle:
            handle.write(outputs[key])
        self.log.verbose(writtenMessage % (target))

    _save("StdOut", "%s/std.out", "Standard output written to %s", "No standard output returned")
    # NOTE(review): getPilotOutput reads 'StdErr' while this reads 'StdError' - confirm the service key.
    _save("StdError", "%s/std.err", "Standard error written to %s", "No standard error returned")

    self.log.always("Outputs retrieved in %s" % outputPath)
    return result
def getPilotLoggingInfo(self, gridReference):
    """Retrieve the pilot logging info for a pilot reference.

      >>> print dirac.getPilotLoggingInfo('https://some.host/pilotRef')
      {'OK': True, 'Value': {"The output of the command"}}

    :param gridReference: Grid pilot job reference Id
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, basestring):
        return WMSAdministratorClient().getPilotLoggingInfo(gridReference)

    # Anything that is not a string is rejected before contacting the service.
    return self._errorReport('Expected string for pilot reference')
def getJobPilotOutput(self, jobID, directory=''):
    """Retrieve the pilot output for an existing job in the WMS.

    The output returned by ``WMSAdministratorClient.getJobPilotOutput`` is
    stored under ``<directory>/pilot_<jobID>/``; that folder must not exist
    beforehand.

      >>> gLogger.notice(dirac.getJobPilotOutput(12345))
      {'OK': True, StdOut:'',StdError:''}

    :param jobID: JobID
    :type jobID: integer or string
    :param directory: existing base directory; ``self.currentDir`` when empty
    :type directory: string
    :return: S_OK,S_ERROR
    """
    baseDir = directory if directory else self.currentDir
    if not os.path.exists(baseDir):
        return self._errorReport('Directory %s does not exist' % baseDir)

    result = WMSAdministratorClient().getJobPilotOutput(jobID)
    if not result['OK']:
        return result

    outputPath = '%s/pilot_%s' % (baseDir, jobID)
    if os.path.exists(outputPath):
        self.log.info('Remove %s and retry to continue' % outputPath)
        return S_ERROR('Remove %s and retry to continue' % outputPath)

    if not os.path.exists(outputPath):
        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

    outputs = result['Value']

    # Standard output
    if 'StdOut' not in outputs:
        self.log.warn('No standard output returned')
    else:
        stdoutPath = '%s/std.out' % (outputPath)
        with open(stdoutPath, 'w') as outFile:
            outFile.write(outputs['StdOut'])
        self.log.verbose('Standard output written to %s' % (stdoutPath))

    # Standard error
    if 'StdError' not in outputs:
        self.log.warn('No standard error returned')
    else:
        stderrPath = '%s/std.err' % (outputPath)
        with open(stderrPath, 'w') as errFile:
            errFile.write(outputs['StdError'])
        self.log.verbose('Standard error written to %s' % (stderrPath))

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result
def getSiteMaskLogging(self, site=None, printOutput=False):
    """Retrieve site mask logging information and optionally print it.

    The site is validated first. The history is then read from
    ``ResourceStatusClient.selectStatusElement`` when ``self.rssFlag`` is set,
    otherwise from ``WMSAdministratorClient.getSiteMaskLogging``.

    Example usage:

      >>> gLogger.notice(diracAdmin.getSiteMaskLogging('LCG.AUVER.fr'))
      {'OK': True, 'Value': }

    :param site: site name; when falsy the printed report is labelled as
                 covering all sites and gets one header per site
    :param printOutput: when True, print the logging info through gLogger
    :return: S_OK() on success (the logging info is not returned), otherwise
             the failed result of the validation or of the query
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
        return result

    if self.rssFlag:
        result = ResourceStatusClient().selectStatusElement('Site', 'History', name=site)
    else:
        result = WMSAdministratorClient().getSiteMaskLogging(site)

    if not result['OK']:
        return result

    if printOutput:
        if site:
            gLogger.notice('\nSite Mask Logging Info for %s\n' % site)
        else:
            gLogger.notice('\nAll Site Mask Logging Info\n')

        sitesLogging = result['Value']
        if isinstance(sitesLogging, dict):
            for siteName, tupleList in sitesLogging.items():
                # A per-site header is only needed when no single site was
                # requested. The guard used to test the loop key (siteName),
                # so the header only appeared for empty site names.
                if not site:
                    gLogger.notice('\n===> %s\n' % siteName)
                for tup in tupleList:
                    stup = str(tup[0]).ljust(8) + str(tup[1]).ljust(20)
                    stup += '( ' + str(tup[2]) + ' ) "' + str(tup[3]) + '"'
                    gLogger.notice(stup)
                gLogger.notice(' ')
        elif isinstance(sitesLogging, list):
            # Only fields 1, 3 and 4 of each record are shown.
            sitesLoggingList = [(sl[1], sl[3], sl[4]) for sl in sitesLogging]
            for siteLog in sitesLoggingList:
                gLogger.notice(siteLog)

    return S_OK()
def initialize(self):
    """Apply the agent's default options and build its DB and client objects.

    :return: S_OK()
    """
    defaults = (
        ('PollingTime', 120),
        ('GridEnv', ''),
        ('PilotStalledDays', 3),
    )
    for name, value in defaults:
        self.am_setOption(name, value)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # Read back the configured delays, with their fallbacks.
    self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
    self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay', 7)

    self.WMSAdministrator = WMSAdministratorClient()

    return S_OK()
def getPilotInfo(self, gridReference):
    """Retrieve info relative to a pilot reference.

      >>> print dirac.getPilotInfo('https://some.host/pilotRef')
      {'OK': True, 'Value': {}}

    :param gridReference: Pilot Job Reference
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, basestring):
        return WMSAdministratorClient().getPilotInfo(gridReference)

    # Anything that is not a string is rejected before contacting the service.
    return self._errorReport('Expected string for pilot reference')
def getJobPilots(self, jobID):
    """Extract the list of submitted pilots and their status for a given
    jobID from the WMS. On success the value is pretty-printed to the screen.

      >>> print dirac.getJobPilots(12345)
      {'OK': True, 'Value': {PilotID:{StatusDict}}}

    :param jobID: JobID
    :type jobID: integer or string
    :return: S_OK,S_ERROR
    """
    if isinstance(jobID, basestring):
        # String IDs must be convertible to an integer.
        try:
            jobID = int(jobID)
        except Exception as x:
            return self._errorReport(str(x), 'Expected integer or string for existing jobID')

    pilots = WMSAdministratorClient().getPilots(jobID)
    if pilots['OK']:
        print(self.pPrint.pformat(pilots['Value']))
    return pilots
def __getJobPilotStatus(self, jobID):
    """Get the job pilot status.

    The pilot reference is read from the job parameter 'Pilot_Reference' and
    then looked up with ``WMSAdministratorClient.getPilotInfo``.

    :param jobID: job identifier (formatted with ``%d`` in the error message)
    :return: S_OK(status), S_OK('NoPilot') when the job has no pilot reference
             or the WMS reports "No pilots found", the failed parameter query
             result, or S_ERROR when the pilot lookup fails otherwise
    """
    paramResult = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not paramResult['OK']:
        return paramResult

    pilotReference = paramResult['Value'].get('Pilot_Reference')
    if not pilotReference:
        # There is no pilot reference, hence its status is unknown
        return S_OK('NoPilot')

    infoResult = WMSAdministratorClient().getPilotInfo(pilotReference)
    if infoResult['OK']:
        return S_OK(infoResult['Value'][pilotReference]['Status'])

    if "No pilots found" in infoResult['Message']:
        self.log.warn(infoResult['Message'])
        return S_OK('NoPilot')

    self.log.error('Failed to get pilot information',
                   'for job %d: ' % jobID + infoResult['Message'])
    return S_ERROR('Failed to get the pilot status')
def getPilotSummary(self, startDate='', endDate=''):
    """Retrieve the pilot summary from the WMS and print it as a table.

    One line is printed per CE, with one Status/Count column pair per state;
    the full dictionary of results is also returned.

      >>> print(dirac.getPilotSummary())
      {'OK': True, 'Value': {CE:{Status:Count}}}

    :param startDate: passed to ``WMSAdministratorClient.getPilotSummary``
    :param endDate: passed to ``WMSAdministratorClient.getPilotSummary``
    :return: S_OK,S_ERROR
    """
    result = WMSAdministratorClient().getPilotSummary(startDate, endDate)
    if not result['OK']:
        return result

    ceDict = result['Value']

    # The header needs as many Status/Count pairs as the CE with the most
    # states; "or [0]" keeps max() valid when no CE is returned.
    maxStates = max([len(summary) for summary in ceDict.values()] or [0])
    headers = 'CE'.ljust(28) + ('Status'.ljust(12) + 'Count'.ljust(12)) * maxStates
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(headers)

    for ce, summary in ceDict.items():
        line = ce.ljust(28)
        for state in sorted(summary):
            line += state.ljust(12) + str(summary[state]).ljust(12)
        print(line)

    return result
class PilotStatusAgent(AgentModule):
    """Agent that declares long-unchanged pilots Deleted and clears old ones.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                     for the agent restart
    """

    # Pilot states selected by handleOldPilots.
    queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
    # Pilot states whose DB status is not overwritten by accountPilots.
    finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']

    def __init__(self, *args, **kwargs):
        """ c'tor """
        AgentModule.__init__(self, *args, **kwargs)

        self.jobDB = None
        self.pilotDB = None
        self.diracadmin = None

    def initialize(self):
        """Set the default options and create the DB and client objects.

        :return: S_OK()
        """
        self.am_setOption('PollingTime', 120)
        self.am_setOption('GridEnv', '')
        self.am_setOption('PilotStalledDays', 3)
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()
        self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
        self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay', 7)
        self.WMSAdministrator = WMSAdministratorClient()

        return S_OK()

    def execute(self):
        """The PilotAgent execution method.

        Handles the old pilots over a dedicated DB connection and then asks
        the WMSAdministrator to clear old pilots.

        :return: S_OK(), or the failed result of the connection request
        """
        self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
        self.gridEnv = self.am_getOption('GridEnv')
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue('/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue('/Systems/WorkloadManagement/%s/GridEnv' % instance, '')

        result = self.pilotDB._getConnection()
        if not result['OK']:
            return result
        connection = result['Value']

        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        # The connection is closed even if the handling raises.
        try:
            self.handleOldPilots(connection)
        finally:
            connection.close()

        result = self.WMSAdministrator.clearPilots(self.clearPilotsDelay, self.clearAbortedDelay)
        if not result['OK']:
            self.log.warn('Failed to clear old pilots in the PilotAgentsDB')

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """Clear pilots in the faulty Waiting state.

        Pilots matching the owner, group, grid type and broker of ``condDict``
        that have been Waiting for longer than MAX_WAITING_STATE_LENGTH hours
        are set to Stalled.

        :param condDict: dict providing 'OwnerDN', 'OwnerGroup', 'GridType' and 'Broker'
        :return: S_OK(), or the failed selection result
        """
        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {'Status': 'Waiting',
                     'OwnerDN': condDict['OwnerDN'],
                     'OwnerGroup': condDict['OwnerGroup'],
                     'GridType': condDict['GridType'],
                     'Broker': condDict['Broker']}
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result['OK']:
            self.log.warn('Failed to get the Pilot Agents for Waiting state')
            return result
        if not result['Value']:
            return S_OK()
        refList = result['Value']

        for pilotRef in refList:
            self.log.info('Setting Waiting pilot to Stalled: %s' % pilotRef)
            result = self.pilotDB.setPilotStatus(pilotRef, 'Stalled', statusReason='Exceeded max waiting time')

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """Clear the parametric parent job from the PilotAgentsDB.

        If at least one child is already in the DB the parent is deleted
        straight away. Otherwise the children are registered with the parent's
        attributes first and the parent is deleted only if every child status
        could be set.

        :param pRef: parent pilot reference
        :param pDict: dict providing 'ChildRefs' and 'ChildDicts'
        :param connection: DB connection passed to the pilotDB calls
        :return: S_OK, S_ERROR
        """
        childList = pDict['ChildRefs']

        # Check that at least one child is in the database
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result['OK'] and result['Value']:
                children_ok = True
                break  # one registered child is enough

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose('Adding children for parent %s' % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        if not result['OK']:
            # Without this check a failed query raised KeyError on 'Value'.
            return result
        parentInfo = result['Value'][pRef]
        tqID = parentInfo['TaskQueueID']
        ownerDN = parentInfo['OwnerDN']
        ownerGroup = parentInfo['OwnerGroup']
        broker = parentInfo['Broker']
        gridType = parentInfo['GridType']
        result = self.pilotDB.addPilotTQReference(childList, tqID, ownerDN, ownerGroup,
                                                  broker=broker, gridType=gridType)
        if not result['OK']:
            return result

        children_added = True
        for chRef, chDict in pDict['ChildDicts'].items():
            result = self.pilotDB.setPilotStatus(chRef, chDict['Status'],
                                                 destination=chDict['DestinationSite'],
                                                 conn=connection)
            if not result['OK']:
                children_added = False

        if not children_added:
            return S_ERROR('Failed to add children')

        self.pilotDB.deletePilot(pRef, conn=connection)
        return S_OK()

    def handleOldPilots(self, connection):
        """Select all pilots that have not been updated in the last N days and
        declare them Deleted, accounting for them.

        Pilots with a job updated within the same period are left alone.
        Pilots are accounted and killed in batches.

        :param connection: DB connection passed to accountPilots
        :return: S_OK(), or the failed pilotDB query result
        """
        pilotsToAccount = {}
        timeLimitToConsider = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
        result = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                           older=timeLimitToConsider,
                                           timeStamp='LastUpdateTime')
        if not result['OK']:
            self.log.error('Failed to get the Pilot Agents')
            return result
        if not result['Value']:
            return S_OK()

        refList = result['Value']
        result = self.pilotDB.getPilotInfo(refList)
        if not result['OK']:
            self.log.error('Failed to get Info for Pilot Agents')
            return result

        pilotsDict = result['Value']

        for pRef in pilotsDict:
            if pilotsDict[pRef].get('Jobs') and self._checkJobLastUpdateTime(pilotsDict[pRef]['Jobs'],
                                                                             self.pilotStalledDays):
                self.log.debug('%s should not be deleted since one job of %s is running.'
                               % (str(pRef), str(pilotsDict[pRef]['Jobs'])))
                continue
            deletedJobDict = pilotsDict[pRef]
            deletedJobDict['Status'] = 'Deleted'
            deletedJobDict['StatusDate'] = Time.dateTime()
            pilotsToAccount[pRef] = deletedJobDict
            # Flush in batches once more than 100 pilots are pending.
            if len(pilotsToAccount) > 100:
                self.accountPilots(pilotsToAccount, connection)
                self._killPilots(pilotsToAccount)
                pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        self._killPilots(pilotsToAccount)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """Account for pilots.

        When the 'PilotAccountingEnabled' option is "yes", accounting records
        are built and committed. The new status is written to the pilotDB only
        if accounting is disabled or the records were sent successfully.

        :param pilotsToAccount: dict pilotRef -> dict with 'Status',
                                'DestinationSite' and 'StatusDate'
        :param connection: DB connection passed to the pilotDB calls
        :return: S_OK(), or the failed result of the info query / record creation
        """
        pae = self.am_getOption('PilotAccountingEnabled', 'yes')
        accountingFlag = pae.lower() == "yes"

        if not pilotsToAccount:
            self.log.info('No pilots to Account')
            return S_OK()

        accountingSent = False
        if accountingFlag:
            # list(...) hands over a real list on both Python 2 and Python 3.
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
            if not retVal['OK']:
                self.log.error('Fail to retrieve Info for pilots', retVal['Message'])
                return retVal
            dbData = retVal['Value']
            for pref in dbData:
                if pref in pilotsToAccount:
                    # Pilots already in a final state keep their DB values.
                    if dbData[pref]['Status'] not in self.finalStateList:
                        dbData[pref]['Status'] = pilotsToAccount[pref]['Status']
                        dbData[pref]['DestinationSite'] = pilotsToAccount[pref]['DestinationSite']
                        dbData[pref]['LastUpdateTime'] = pilotsToAccount[pref]['StatusDate']

            retVal = self.__addPilotsAccountingReport(dbData)
            if not retVal['OK']:
                # Previously logged with the message of the getPilotInfo failure.
                self.log.error('Failed to add pilots accounting report', retVal['Message'])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal['OK']:
                self.log.error("Can't send accounting reports", retVal['Message'])
            else:
                self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
                self.pilotDB.setPilotStatus(pRef,
                                            pDict['Status'],
                                            pDict['DestinationSite'],
                                            pDict['StatusDate'],
                                            conn=connection)

        return S_OK()

    def __addPilotsAccountingReport(self, pilotsData):
        """Fill accounting data: register one PilotAccounting record per pilot.

        :param pilotsData: dict pilotRef -> pilot info dict
        :return: S_OK(), or the first failed addRegister result
        """
        for pRef in pilotsData:
            pData = pilotsData[pRef]
            pA = PilotAccounting()
            pA.setEndTime(pData['LastUpdateTime'])
            pA.setStartTime(pData['SubmissionTime'])
            retVal = CS.getUsernameForDN(pData['OwnerDN'])
            if not retVal['OK']:
                userName = '******'
                self.log.error("Can't determine username for dn:", pData['OwnerDN'])
            else:
                userName = retVal['Value']
            pA.setValueByKey('User', userName)
            pA.setValueByKey('UserGroup', pData['OwnerGroup'])
            result = getSiteForCE(pData['DestinationSite'])
            if result['OK'] and result['Value'].strip():
                pA.setValueByKey('Site', result['Value'].strip())
            else:
                pA.setValueByKey('Site', 'Unknown')
            pA.setValueByKey('GridCE', pData['DestinationSite'])
            pA.setValueByKey('GridMiddleware', pData['GridType'])
            pA.setValueByKey('GridResourceBroker', pData['Broker'])
            pA.setValueByKey('GridStatus', pData['Status'])
            if 'Jobs' not in pData:
                pA.setValueByKey('Jobs', 0)
            else:
                pA.setValueByKey('Jobs', len(pData['Jobs']))
            self.log.verbose("Added accounting record for pilot %s" % pData['PilotID'])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal['OK']:
                return retVal
        return S_OK()

    def _killPilots(self, acc):
        """Kill every pilot of ``acc`` for which pilot info with a status is found.

        :param acc: dict keyed by pilot reference
        """
        for i in sorted(acc.keys()):
            result = self.diracadmin.getPilotInfo(i)
            if result['OK'] and i in result['Value'] and 'Status' in result['Value'][i]:
                ret = self.diracadmin.killPilot(str(i))
                if ret['OK']:
                    self.log.info("Successfully deleted: %s (Status : %s)" % (i, result['Value'][i]['Status']))
                else:
                    self.log.error("Failed to delete pilot: ", "%s : %s" % (i, ret['Message']))
            else:
                self.log.error("Failed to get pilot info", "%s : %s" % (i, str(result)))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """Tell whether any job of ``joblist`` was updated within ``StalledDays`` days.

        :param joblist: iterable of job IDs (converted with int())
        :param StalledDays: number of days to look back
        :return: True as soon as one recently updated job is found, else False
        """
        timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
        ret = False
        for jobID in joblist:
            result = self.jobDB.getJobAttributes(int(jobID))
            if result['OK']:
                if 'LastUpdateTime' in result['Value']:
                    lastUpdateTime = result['Value']['LastUpdateTime']
                    if Time.fromString(lastUpdateTime) > timeLimitToConsider:
                        ret = True
                        self.log.debug(
                            'Since %s updates LastUpdateTime on %s this does not to need to be deleted.' %
                            (str(jobID), str(lastUpdateTime)))
                        break
            else:
                self.log.error("Error taking job info from DB", result['Message'])
        return ret
class JobCommand(Command):
    """
    Job "master" Command.

    Collects per-site job statistics from the WMSAdministrator and keeps
    them in the ResourceManagement job cache.
    """

    def __init__(self, args=None, clients=None):
        super(JobCommand, self).__init__(args, clients)

        # Use the clients handed over by the caller, build defaults otherwise
        if "WMSAdministrator" not in self.apis:
            self.wmsAdmin = WMSAdministratorClient()
        else:
            self.wmsAdmin = self.apis["WMSAdministrator"]

        if "ResourceManagementClient" not in self.apis:
            self.rmClient = ResourceManagementClient()
        else:
            self.rmClient = self.apis["ResourceManagementClient"]

    def _storeCommand(self, result):
        """
        Write the records produced by doNew into the job cache.

        :param result: iterable of dictionaries with at least the keys
                       'Site', 'MaskStatus', 'Efficiency' and 'Status'
        :return: S_OK() if every record was stored, otherwise the first
                 failing reply of the ResourceManagement client
        """
        for entry in result:
            outcome = self.rmClient.addOrModifyJobCache(
                entry["Site"], entry["MaskStatus"], entry["Efficiency"], entry["Status"])
            if not outcome["OK"]:
                return outcome
        return S_OK()

    def _prepareCommand(self):
        """
        JobCommand requires one argument:
        - name : <str>

        :return: S_OK(name) || S_ERROR if "name" is not in self.args
        """
        if "name" in self.args:
            return S_OK(self.args["name"])
        return S_ERROR('"name" not found in self.args')

    def doNew(self, masterParams=None):
        """
        Query the WMSAdministrator for the site summary, normalise the
        records and store them in the cache.

        The site name(s) come from masterParams when given (master mode),
        otherwise from the command's own arguments.

        :param masterParams: site name or list of site names, or None
        :return: S_OK(list of record dictionaries) || S_ERROR
        """
        if masterParams is None:
            prepared = self._prepareCommand()
            if not prepared["OK"]:
                return prepared
            name = prepared["Value"]
        else:
            name = masterParams

        # selectDict, sortList, startItem, maxItems
        # Returns statistics of Last day !
        response = self.wmsAdmin.getSiteSummaryWeb({"Site": name}, [], 0, 0)
        if not response["OK"]:
            return response
        summary = response["Value"]

        if "ParameterNames" not in summary:
            return S_ERROR('Wrong result dictionary, missing "ParameterNames"')
        if "Records" not in summary:
            return S_ERROR('Wrong formed result dictionary, missing "Records"')
        columns = summary["ParameterNames"]

        uniformResult = []
        for row in summary["Records"]:
            # Each record becomes a dictionary with the following keys
            # 'Site', 'GridType', 'Country', 'Tier', 'MaskStatus', 'Received',
            # 'Checking', 'Staging', 'Waiting', 'Matched', 'Running', 'Stalled',
            # 'Done', 'Completed', 'Failed', 'Efficiency', 'Status'
            siteStats = dict(zip(columns, row))
            # Efficiency is stored as a float
            siteStats["Efficiency"] = float(siteStats["Efficiency"])
            uniformResult.append(siteStats)

        stored = self._storeCommand(uniformResult)
        if not stored["OK"]:
            return stored

        return S_OK(uniformResult)

    def doCache(self):
        """
        Read the records of the requested site from the job cache.

        :return: S_OK(list of dictionaries, one per cached row) || S_ERROR
        """
        prepared = self._prepareCommand()
        if not prepared["OK"]:
            return prepared

        cached = self.rmClient.selectJobCache(prepared["Value"])
        if not cached["OK"]:
            return cached
        return S_OK([dict(zip(cached["Columns"], row)) for row in cached["Value"]])

    def doMaster(self):
        """
        Master method: get all the sites and run doNew on them. A failure
        of doNew is recorded in the metrics rather than returned.

        :return: S_OK(self.metrics) || S_ERROR if the sites cannot be listed
        """
        sites = getSites()
        if not sites["OK"]:
            return sites

        outcome = self.doNew(sites["Value"])
        if not outcome["OK"]:
            self.metrics["failed"].append(outcome["Message"])

        return S_OK(self.metrics)
class PilotCommand(Command):
    """
    Pilot "master" Command.

    Collects pilot statistics per Site or per Resource from the
    WMSAdministrator and keeps them in the ResourceManagement pilot cache.
    """

    def __init__(self, args=None, clients=None):
        super(PilotCommand, self).__init__(args, clients)

        # Use the clients handed over by the caller, build defaults otherwise
        if 'WMSAdministrator' not in self.apis:
            self.wmsAdmin = WMSAdministratorClient()
        else:
            self.wmsAdmin = self.apis['WMSAdministrator']

        if 'ResourceManagementClient' not in self.apis:
            self.rmClient = ResourceManagementClient()
        else:
            self.rmClient = self.apis['ResourceManagementClient']

    def _storeCommand(self, result):
        """
        Write the records produced by doNew into the pilot cache.

        :param result: iterable of dictionaries with at least the keys
                       'Site', 'CE', 'PilotsPerJob', 'PilotJobEff', 'Status'
        :return: S_OK() if every record was stored, otherwise the first
                 failing reply of the ResourceManagement client
        """
        for entry in result:
            outcome = self.rmClient.addOrModifyPilotCache(
                entry['Site'], entry['CE'], entry['PilotsPerJob'],
                entry['PilotJobEff'], entry['Status'])
            if not outcome['OK']:
                return outcome
        return S_OK()

    def _prepareCommand(self):
        """
        PilotCommand requires two arguments:
        - name    : <str>
        - element : 'Site' | 'Resource'

        :return: S_OK((element, name)) || S_ERROR
        """
        if 'name' not in self.args:
            return S_ERROR('"name" not found in self.args')
        if 'element' not in self.args:
            return S_ERROR('element is missing')

        name = self.args['name']
        element = self.args['element']
        if element in ['Site', 'Resource']:
            return S_OK((element, name))
        return S_ERROR('"%s" is not Site nor Resource' % element)

    def doNew(self, masterParams=None):
        """
        Query the WMSAdministrator for the pilot summary, normalise the
        records and store them in the cache.

        :param masterParams: (element, name) tuple in master mode, or None to
                             take both from the command's own arguments
        :return: S_OK(list of record dictionaries) || S_ERROR
        """
        if masterParams is None:
            prepared = self._prepareCommand()
            if not prepared['OK']:
                return prepared
            element, name = prepared['Value']
        else:
            element, name = masterParams

        if element == 'Site':
            wmsDict = {'GridSite': name}
        elif element == 'Resource':
            wmsDict = {'ExpandSite': name}
        else:
            # You should never see this error
            return S_ERROR('"%s" is not Site nor Resource' % element)

        response = self.wmsAdmin.getPilotSummaryWeb(wmsDict, [], 0, 0)
        if not response['OK']:
            return response
        summary = response['Value']

        if 'ParameterNames' not in summary:
            return S_ERROR('Wrong result dictionary, missing "ParameterNames"')
        if 'Records' not in summary:
            return S_ERROR('Wrong formed result dictionary, missing "Records"')
        columns = summary['ParameterNames']

        uniformResult = []
        for row in summary['Records']:
            # Each record becomes a dictionary with the following keys:
            # 'Site', 'CE', 'Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running',
            # 'Done', 'Aborted', 'Done_Empty', 'Aborted_Hour', 'Total', 'PilotsPerJob',
            # 'PilotJobEff', 'Status', 'InMask'
            pilotStats = dict(zip(columns, row))
            for key in ('PilotsPerJob', 'PilotJobEff'):
                pilotStats[key] = float(pilotStats[key])
            uniformResult.append(pilotStats)

        stored = self._storeCommand(uniformResult)
        if not stored['OK']:
            return stored

        return S_OK(uniformResult)

    def doCache(self):
        """
        Read the records of the requested element from the pilot cache.

        :return: S_OK(list of dictionaries, one per cached row) || S_ERROR
        """
        prepared = self._prepareCommand()
        if not prepared['OK']:
            return prepared
        element, name = prepared['Value']

        if element == 'Site':
            # WMS returns Site entries with CE = 'Multiple'
            site, ce = name, 'Multiple'
        elif element == 'Resource':
            site, ce = None, name
        else:
            # You should never see this error
            return S_ERROR('"%s" is not Site nor Resource' % element)

        cached = self.rmClient.selectPilotCache(site, ce)
        if not cached['OK']:
            return cached
        return S_OK([dict(zip(cached['Columns'], row)) for row in cached['Value']])

    def doMaster(self):
        """
        Master method: run doNew for all the sites and then for all the
        computing elements. Failures of doNew are recorded in the metrics.

        :return: S_OK(self.metrics) || S_ERROR if sites or CEs cannot be listed
        """
        sites = getSites()
        if not sites['OK']:
            return sites

        ces = CSHelpers.getComputingElements()
        if not ces['OK']:
            return ces

        for masterParams in (('Site', sites['Value']), ('Resource', ces['Value'])):
            outcome = self.doNew(masterParams)
            if not outcome['OK']:
                self.metrics['failed'].append(outcome['Message'])

        return S_OK(self.metrics)
labels = [ 'pilotUUID', 'timestamp', 'source', 'phase', 'status', 'messageContent' ] for log in logs: content.append([log[label] for label in labels]) printTable(labels, content, numbering=False, columnSeparator=' | ') from DIRAC.WorkloadManagementSystem.Client.PilotsLoggingClient import PilotsLoggingClient from DIRAC.WorkloadManagementSystem.Client.WMSAdministratorClient import WMSAdministratorClient if uuid: result = PilotsLoggingClient().getPilotsLogging(uuid) if not result['OK']: print 'ERROR: %s' % result['Message'] DIRAC.exit(1) printPilotsLogging(result['Value']) DIRAC.exit(0) else: info = WMSAdministratorClient().getPilots(jobid) if not info['OK']: print info['Message'] DIRAC.exit(1) for pilot in info['Value']: logging = PilotsLoggingClient().getPilotsLogging( pilot['PilotJobReference']) if not logging['OK']: print logging['Message'] printPilotsLogging(logging) DIRAC.exit(0)
def test_JobDBWMSAdmin(self):
    """
    Walk through the site-mask calls of the WMSAdministrator service:
    set the mask, ban and allow a site, read logging and summaries, and
    finally clear the mask.
    """
    client = WMSAdministratorClient()
    firstSite = 'My.Site.org'
    secondSite = 'Your.Site.org'
    sitesList = [firstSite, secondSite]

    def currentMask():
        # Fetch the mask, insisting that the call itself succeeded
        reply = client.getSiteMask()
        self.assertTrue(reply['OK'])
        return reply['Value']

    # Both sites go into the mask
    self.assertTrue(client.setSiteMask(sitesList)['OK'])
    self.assertEqual(sorted(currentMask()), sorted(sitesList))

    # Banning one site leaves only the other one
    self.assertTrue(client.banSite(firstSite, 'This is a comment')['OK'])
    self.assertEqual(sorted(currentMask()), [secondSite])

    # Allowing it again restores the full mask
    self.assertTrue(client.allowSite(firstSite, 'This is a comment')['OK'])
    self.assertEqual(sorted(currentMask()), sorted(sitesList))

    reply = client.getSiteMaskLogging(sitesList)
    self.assertTrue(reply['OK'])
    self.assertEqual(reply['Value'][firstSite][0][3], 'No comment')

    reply = client.getSiteMaskSummary()
    self.assertTrue(reply['OK'])
    self.assertEqual(reply['Value'][firstSite], 'Active')

    reply = client.getSiteSummaryWeb({}, [], 0, 100)
    self.assertTrue(reply['OK'])
    self.assertIn(reply['Value']['TotalRecords'], [0, 1, 2, 34])

    self.assertTrue(client.getSiteSummarySelectors()['OK'])

    # Clearing the mask leaves it empty
    self.assertTrue(client.clearMask()['OK'])
    self.assertEqual(currentMask(), [])
class CloudDirector(AgentModule):
    """
    Agent creating VM instances on cloud endpoints according to the
    workload waiting in the task queues.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
    """

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)
        # VM type name ('<CE>_<VMType>') -> description dictionary
        self.vmTypeDict = {}
        # VM type name -> {'Hash': ..., 'CE': endpoint object}
        self.vmTypeCECache = {}
        self.vmTypeSlots = {}
        # VM type name -> number of failed submissions
        self.failedVMTypes = defaultdict(int)
        self.firstPass = True

        self.vo = ''
        self.group = ''
        # self.voGroups contain all the eligible user groups for clouds submitted by this SiteDirector
        self.voGroups = []
        self.cloudDN = ''
        self.cloudGroup = ''
        self.platforms = []
        self.sites = []
        self.wmsClient = WMSAdministratorClient()

        self.proxy = None

        self.updateStatus = True
        self.getOutput = False
        self.sendAccounting = True

    def initialize(self):
        """ Initial settings: nothing to do here, the configuration is
            read at the beginning of each cycle in beginExecution()
        """
        return S_OK()

    def beginExecution(self):
        """ Read the agent options and rebuild the VM type descriptions

        :return: S_OK() || S_ERROR
        """
        # The Director is for a particular user community
        self.vo = self.am_getOption("VO", '')
        if not self.vo:
            self.vo = CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", '')

        # Choose the group for which clouds will be submitted. This is a hack until
        # we will be able to match clouds to VOs.
        if not self.group:
            if self.vo:
                result = Registry.getGroupsForVO(self.vo)
                if not result['OK']:
                    return result
                self.voGroups = []
                for group in result['Value']:
                    if 'NormalUser' in Registry.getPropertiesForGroup(group):
                        self.voGroups.append(group)
        else:
            self.voGroups = [self.group]

        result = findGenericCloudCredentials(vo=self.vo)
        if not result['OK']:
            return result
        self.cloudDN, self.cloudGroup = result['Value']
        self.maxVMsToSubmit = self.am_getOption('MaxVMsToSubmit', 1)
        self.runningPod = self.am_getOption('RunningPod', self.vo)

        # Get the site description dictionary
        siteNames = None
        if not self.am_getOption('Site', 'Any').lower() == "any":
            siteNames = self.am_getOption('Site', [])
            if not siteNames:
                siteNames = None
        ces = None
        if not self.am_getOption('CEs', 'Any').lower() == "any":
            ces = self.am_getOption('CEs', [])
            if not ces:
                ces = None

        result = getVMTypes(vo=self.vo, siteList=siteNames)
        if not result['OK']:
            return result
        resourceDict = result['Value']
        result = self.getEndpoints(resourceDict)
        if not result['OK']:
            return result

        # if not siteNames:
        #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
        #  if siteName == 'Unknown':
        #    return S_OK( 'No site specified for the SiteDirector' )
        #  else:
        #    siteNames = [siteName]
        #self.siteNames = siteNames

        self.log.always('Sites:', siteNames)
        self.log.always('CEs:', ces)
        self.log.always('CloudDN:', self.cloudDN)
        self.log.always('CloudGroup:', self.cloudGroup)

        self.localhost = socket.getfqdn()
        self.proxy = ''

        # The served VM types are announced only in the first cycle
        if self.firstPass:
            if self.vmTypeDict:
                self.log.always("Agent will serve VM types:")
                for vmType in self.vmTypeDict:
                    self.log.always(
                        "Site: %s, CE: %s, VMType: %s" % (self.vmTypeDict[vmType]['Site'],
                                                          self.vmTypeDict[vmType]['CEName'],
                                                          vmType))
        self.firstPass = False
        return S_OK()

    def __generateVMTypeHash(self, vmTypeDict):
        """ Generate a hash of the queue description

        :param dict vmTypeDict: VM type description
        :return: hexadecimal MD5 digest of the string form of the description
        """
        myMD5 = hashlib.md5()
        myMD5.update(str(vmTypeDict))
        hexstring = myMD5.hexdigest()
        return hexstring

    def getEndpoints(self, resourceDict):
        """ Get the list of relevant CEs and their descriptions

        Rebuilds self.vmTypeDict from scratch and extends self.platforms and
        self.sites. Endpoint objects are reused from self.vmTypeCECache as
        long as the hash of their description does not change.

        :param dict resourceDict: {site: {ce: ceDict}}, each ceDict holding a
                                  'VMTypes' entry (removed from ceDict here)
        :return: S_OK() || S_ERROR
        """
        self.vmTypeDict = {}
        ceFactory = EndpointFactory()

        result = getPilotBootstrapParameters(vo=self.vo, runningPod=self.runningPod)
        if not result['OK']:
            return result
        opParameters = result['Value']

        for site in resourceDict:
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                ceTags = ceDict.get('Tag', [])
                if isinstance(ceTags, basestring):
                    ceTags = fromChar(ceTags)
                ceMaxRAM = ceDict.get('MaxRAM', None)
                qDict = ceDict.pop('VMTypes')
                for vmType in qDict:
                    vmTypeName = '%s_%s' % (ce, vmType)
                    # The parameters dictionary is the one of the VM type
                    # description itself, completed in place
                    vmTypeParams = qDict[vmType]
                    self.vmTypeDict[vmTypeName] = {'ParametersDict': vmTypeParams}
                    vmTypeParams['VMType'] = vmType
                    vmTypeParams['Site'] = site
                    vmTypeParams['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')
                    vmTypeParams['CPUTime'] = 99999999

                    vmTypeTags = vmTypeParams.get('Tag')
                    if vmTypeTags and isinstance(vmTypeTags, basestring):
                        vmTypeTags = fromChar(vmTypeTags)
                        vmTypeParams['Tag'] = vmTypeTags
                    if ceTags:
                        if vmTypeTags:
                            allTags = list(set(ceTags + vmTypeTags))
                            vmTypeParams['Tag'] = allTags
                        else:
                            # Copy: the list is extended below and must not be
                            # shared with the other VM types of this CE
                            vmTypeParams['Tag'] = list(ceTags)

                    maxRAM = vmTypeParams.get('MaxRAM')
                    maxRAM = ceMaxRAM if not maxRAM else maxRAM
                    if maxRAM:
                        vmTypeParams['MaxRAM'] = maxRAM

                    ceWholeNode = ceDict.get('WholeNode', 'true')
                    wholeNode = vmTypeParams.get('WholeNode', ceWholeNode)
                    if wholeNode.lower() in ('yes', 'true'):
                        vmTypeParams.setdefault('Tag', [])
                        vmTypeParams['Tag'].append('WholeNode')

                    # The VM type platform takes precedence over the CE one
                    platform = ''
                    if "Platform" in vmTypeParams:
                        platform = vmTypeParams['Platform']
                    elif "Platform" in ceDict:
                        platform = ceDict['Platform']
                    if platform and platform not in self.platforms:
                        self.platforms.append(platform)

                    if "Platform" not in vmTypeParams and platform:
                        result = Resources.getDIRACPlatform(platform)
                        if result['OK']:
                            vmTypeParams['Platform'] = result['Value'][0]

                    ceVMTypeDict = dict(ceDict)
                    ceVMTypeDict['CEName'] = ce
                    ceVMTypeDict['VO'] = self.vo
                    ceVMTypeDict['VMType'] = vmType
                    ceVMTypeDict['RunningPod'] = self.runningPod
                    ceVMTypeDict['CSServers'] = gConfig.getValue("/DIRAC/Configuration/Servers", [])
                    ceVMTypeDict.update(vmTypeParams)

                    # Allow a resource-specifc CAPath to be set (as some clouds have their own CAs)
                    # Otherwise fall back to the system-wide default(s)
                    if 'CAPath' not in ceVMTypeDict:
                        ceVMTypeDict['CAPath'] = gConfig.getValue(
                            '/DIRAC/Security/CAPath',
                            "/opt/dirac/etc/grid-security/certificates/cas.pem")

                    # Generate the CE object for the vmType or pick the already existing one
                    # if the vmType definition did not change
                    vmTypeHash = self.__generateVMTypeHash(ceVMTypeDict)
                    if vmTypeName in self.vmTypeCECache and \
                            self.vmTypeCECache[vmTypeName]['Hash'] == vmTypeHash:
                        vmTypeCE = self.vmTypeCECache[vmTypeName]['CE']
                    else:
                        result = ceFactory.getCEObject(parameters=ceVMTypeDict)
                        if not result['OK']:
                            return result
                        self.vmTypeCECache.setdefault(vmTypeName, {})
                        self.vmTypeCECache[vmTypeName]['Hash'] = vmTypeHash
                        self.vmTypeCECache[vmTypeName]['CE'] = result['Value']
                        vmTypeCE = self.vmTypeCECache[vmTypeName]['CE']
                        vmTypeCE.setBootstrapParameters(opParameters)

                    self.vmTypeDict[vmTypeName]['CE'] = vmTypeCE
                    self.vmTypeDict[vmTypeName]['CEName'] = ce
                    self.vmTypeDict[vmTypeName]['CEType'] = ceDict['CEType']
                    self.vmTypeDict[vmTypeName]['Site'] = site
                    self.vmTypeDict[vmTypeName]['VMType'] = vmType
                    self.vmTypeDict[vmTypeName]['Platform'] = platform
                    self.vmTypeDict[vmTypeName]['MaxInstances'] = ceDict['MaxInstances']
                    if not self.vmTypeDict[vmTypeName]['CE'].isValid():
                        self.log.error('Failed to instantiate CloudEndpoint for %s' % vmTypeName)
                        continue

                    if site not in self.sites:
                        self.sites.append(site)

        return S_OK()

    def execute(self):
        """ Main execution method

        :return: S_OK() always, submission errors are only logged
        """
        if not self.vmTypeDict:
            self.log.warn('No site defined, exiting the cycle')
            return S_OK()

        result = self.createVMs()
        if not result['OK']:
            self.log.error('Errors in the job submission: ', result['Message'])

        #cyclesDone = self.am_getModuleParam( 'cyclesDone' )
        # if self.updateStatus and cyclesDone % self.cloudStatusUpdateCycleFactor == 0:
        #  result = self.updatePilotStatus()
        #  if not result['OK']:
        #    self.log.error( 'Errors in updating cloud status: ', result['Message'] )

        return S_OK()

    def createVMs(self):
        """ Go through defined computing elements and submit jobs if necessary

        :return: S_OK() || S_ERROR
        """
        # A real list is needed: it is shuffled below
        vmTypeList = list(self.vmTypeDict)

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {'Setup': setup,
                  'CPUTime': 9999999}
        if self.vo:
            tqDict['VO'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for vmType in vmTypeList:
            if 'Tag' in self.vmTypeDict[vmType]['ParametersDict']:
                tags += self.vmTypeDict[vmType]['ParametersDict']['Tag']
        tqDict['Tag'] = list(set(tags))
        tqDict['SubmitPool'] = "wenmrPool"

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # Sites requested by the waiting jobs; task queues with job types
        # contribute to testSites, which are served even when not in the mask
        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        totalTaskQueues = len(result['Value'])

        result = virtualMachineDB.getInstanceCounters('Status', {})
        totalVMs = 0
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted', 'Running']:
                    totalVMs += result['Value'][status]
        self.log.info('Total %d jobs in %d task queues with %d VMs' %
                      (totalWaitingJobs, totalTaskQueues, totalVMs))

        # Check if the site is allowed in the mask
        result = self.wmsClient.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        random.shuffle(vmTypeList)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for vmType in vmTypeList:
            ce = self.vmTypeDict[vmType]['CE']
            ceName = self.vmTypeDict[vmType]['CEName']
            vmTypeName = self.vmTypeDict[vmType]['VMType']
            siteName = self.vmTypeDict[vmType]['Site']
            platform = self.vmTypeDict[vmType]['Platform']
            vmTypeTags = self.vmTypeDict[vmType]['ParametersDict'].get('Tag', [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.vmTypeDict[vmType]['MaxInstances'])
            # vms support WholeNode naturally
            processorTags = ['WholeNode']

            if not anySite and siteName not in jobSites:
                self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName))
                continue

            if 'CPUTime' in self.vmTypeDict[vmType]['ParametersDict']:
                # The value is not used further, but it has to be a valid integer
                int(self.vmTypeDict[vmType]['ParametersDict']['CPUTime'])
            else:
                self.log.warn('CPU time limit is not specified for queue %s, skipping...' % vmType)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['VO'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                self.log.error('Failed to get compatible platforms for %s' % vmType, result['Message'])
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = list(set(processorTags + vmTypeTags))

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % vmType)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                             (totalTQJobs, len(taskQueueDict), vmType))

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters('Status', {'Endpoint': endpoint})
            if result['OK']:
                for status in result['Value']:
                    if status in ['New', 'Submitted']:
                        totalWaitingVMs += result['Value'][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

            self.log.verbose("%d VMs for the total of %d eligible jobs for %s" %
                             (totalWaitingVMs, totalTQJobs, vmType))

            # Get proxy to be used to connect to the cloud endpoint
            authType = ce.parameters.get('Auth')
            if authType and authType.lower() in ['x509', 'voms']:
                self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
                result = getProxyFileForCE(ce)
                if not result['OK']:
                    self.log.error('Failed to get cloud proxy for %s/%s' % (siteName, ceName),
                                   result['Message'])
                    continue
                ce.setProxy(result['Value'])

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % vmType)
                continue

            vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
            self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                          (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit))

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                continue

            self.log.info('Going to submit %d VMs to %s queue' % (vmsToSubmit, vmType))
            result = ce.createInstances(vmsToSubmit)

            #result = S_OK()
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % vmType, result['Message'])
                # failedVMTypes is a defaultdict(int), missing entries start at 0
                self.failedVMTypes[vmType] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result['Value']
            totalSubmittedPilots += len(vmDict)
            self.log.info('Submitted %d VMs to %s@%s' % (len(vmDict), vmTypeName, ceName))

            # One pilot reference per processor of each registered VM
            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]['InstanceID']
                endpoint = '%s::%s' % (self.vmTypeDict[vmType]['Site'], ceName)
                result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo)
                if not result['OK']:
                    self.log.error('Failed to insert instance %s into the VirtualMachineDB' % uuID,
                                   result['Message'])
                    continue
                for ncpu in range(vmDict[uuID]['NumberOfProcessors']):
                    pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(ncpu).zfill(2)
                    pilotList.append(pRef)

            # Distribute the pilots over the task queues with a probability
            # proportional to the task queue priority (cumulative sums)
            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            pilotsPerTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last task queue so that a queue of this VM
                # type is always chosen, even if no cumulative priority
                # exceeds the random value (e.g. all priorities are zero)
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if tqID not in pilotsPerTQ:
                    pilotsPerTQ[tqID] = []
                pilotsPerTQ[tqID].append(pilotID)

            for tqID, tqPilots in pilotsPerTQ.items():
                result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                           tqID,
                                                           '',
                                                           '',
                                                           self.localhost,
                                                           'Cloud',
                                                           stampDict)
                if not result['OK']:
                    self.log.error('Failed to insert pilots into the PilotAgentsDB: %s' % result['Message'])

        self.log.info("%d VMs submitted in total in this cycle, %d matched queues" %
                      (totalSubmittedPilots, matchedQueues))
        return S_OK()

    def getVMInstances(self, endpoint, maxInstances):
        """ Get the number of VM instances that can still be created

        :param str endpoint: endpoint name in the form '<site>::<ce>'
        :param int maxInstances: maximum number of instances for the endpoint
        :return: number of free slots (int, never negative); 0 if the
                 instance counters cannot be obtained, so that nothing is
                 submitted without knowing the endpoint occupancy
        """
        result = virtualMachineDB.getInstanceCounters('Status', {'Endpoint': endpoint})
        if not result['OK']:
            self.log.error('Failed to get VM instance counters for %s' % endpoint, result['Message'])
            return 0

        count = 0
        for status in result['Value']:
            if status in ['New', 'Submitted', 'Running']:
                count += int(result['Value'][status])

        return max(0, maxInstances - count)
class PilotCommand(Command):
    """
    Pilot "master" Command.

    Collects pilot statistics per Site or per Resource from the
    WMSAdministrator and keeps them in the ResourceManagement pilot cache.
    """

    def __init__(self, args=None, clients=None):
        super(PilotCommand, self).__init__(args, clients)

        # Use the clients handed over by the caller, build defaults otherwise
        if 'WMSAdministrator' not in self.apis:
            self.wmsAdmin = WMSAdministratorClient()
        else:
            self.wmsAdmin = self.apis['WMSAdministrator']

        if 'ResourceManagementClient' not in self.apis:
            self.rmClient = ResourceManagementClient()
        else:
            self.rmClient = self.apis['ResourceManagementClient']

    def _storeCommand(self, result):
        """
        Write the records produced by doNew into the pilot cache.

        :param result: iterable of dictionaries with at least the keys
                       'Site', 'CE', 'PilotsPerJob', 'PilotJobEff', 'Status'
        :return: S_OK() if every record was stored, otherwise the first
                 failing reply of the ResourceManagement client
        """
        for entry in result:
            outcome = self.rmClient.addOrModifyPilotCache(
                entry['Site'], entry['CE'], entry['PilotsPerJob'],
                entry['PilotJobEff'], entry['Status'])
            if not outcome['OK']:
                return outcome
        return S_OK()

    def _prepareCommand(self):
        """
        PilotCommand requires two arguments:
        - name    : <str>
        - element : 'Site' | 'Resource'

        :return: S_OK((element, name)) || S_ERROR
        """
        if 'name' not in self.args:
            return S_ERROR('"name" not found in self.args')
        if 'element' not in self.args:
            return S_ERROR('element is missing')

        name = self.args['name']
        element = self.args['element']
        if element in ['Site', 'Resource']:
            return S_OK((element, name))
        return S_ERROR('"%s" is not Site nor Resource' % element)

    def doNew(self, masterParams=None):
        """
        Query the WMSAdministrator for the pilot summary, normalise the
        records and store them in the cache.

        :param masterParams: (element, name) tuple in master mode, or None to
                             take both from the command's own arguments
        :return: S_OK(list of record dictionaries) || S_ERROR
        """
        if masterParams is None:
            prepared = self._prepareCommand()
            if not prepared['OK']:
                return prepared
            element, name = prepared['Value']
        else:
            element, name = masterParams

        if element == 'Site':
            wmsDict = {'GridSite': name}
        elif element == 'Resource':
            wmsDict = {'ExpandSite': name}
        else:
            # You should never see this error
            return S_ERROR('"%s" is not Site nor Resource' % element)

        response = self.wmsAdmin.getPilotSummaryWeb(wmsDict, [], 0, 0)
        if not response['OK']:
            return response
        summary = response['Value']

        if 'ParameterNames' not in summary:
            return S_ERROR('Wrong result dictionary, missing "ParameterNames"')
        if 'Records' not in summary:
            return S_ERROR('Wrong formed result dictionary, missing "Records"')
        columns = summary['ParameterNames']

        uniformResult = []
        for row in summary['Records']:
            # Each record becomes a dictionary with the following keys:
            # 'Site', 'CE', 'Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running',
            # 'Done', 'Aborted', 'Done_Empty', 'Aborted_Hour', 'Total', 'PilotsPerJob',
            # 'PilotJobEff', 'Status', 'InMask'
            pilotStats = dict(zip(columns, row))
            for key in ('PilotsPerJob', 'PilotJobEff'):
                pilotStats[key] = float(pilotStats[key])
            uniformResult.append(pilotStats)

        stored = self._storeCommand(uniformResult)
        if not stored['OK']:
            return stored

        return S_OK(uniformResult)

    def doCache(self):
        """
        Read the records of the requested element from the pilot cache.

        :return: S_OK(list of dictionaries, one per cached row) || S_ERROR
        """
        prepared = self._prepareCommand()
        if not prepared['OK']:
            return prepared
        element, name = prepared['Value']

        if element == 'Site':
            # WMS returns Site entries with CE = 'Multiple'
            site, ce = name, 'Multiple'
        elif element == 'Resource':
            site, ce = None, name
        else:
            # You should never see this error
            return S_ERROR('"%s" is not Site nor Resource' % element)

        cached = self.rmClient.selectPilotCache(site, ce)
        if not cached['OK']:
            return cached
        return S_OK([dict(zip(cached['Columns'], row)) for row in cached['Value']])

    def doMaster(self):
        """
        Master method: run doNew for all the sites and then for all the
        computing elements. Failures of doNew are recorded in the metrics.

        :return: S_OK(self.metrics) || S_ERROR if sites or CEs cannot be listed
        """
        sites = getSites()
        if not sites['OK']:
            return sites

        ces = CSHelpers.getComputingElements()
        if not ces['OK']:
            return ces

        for masterParams in (('Site', sites['Value']), ('Resource', ces['Value'])):
            outcome = self.doNew(masterParams)
            if not outcome['OK']:
                self.metrics['failed'].append(outcome['Message'])

        return S_OK(self.metrics)
class JobsWMSCommand(Command):
    """
    Command returning the job summary records of one or more sites as
    reported by the WMSAdministrator.
    """

    def __init__(self, args=None, clients=None):
        super(JobsWMSCommand, self).__init__(args, clients)

        # Use the client handed over by the caller, build a default otherwise
        if "WMSAdministrator" not in self.apis:
            self.wmsAdmin = WMSAdministratorClient()
        else:
            self.wmsAdmin = self.apis["WMSAdministrator"]

    def doCommand(self):
        """
        Returns the site summary records with the efficiency cast to float.

        The command reads "siteName" from self.args; a value of None means
        that all the sites are taken.

        :return: S_OK(list of record dictionaries), or the output of
                 self.returnERROR if the argument is missing, a call fails
                 or the reply is malformed
        """
        if "siteName" not in self.args:
            return self.returnERROR(S_ERROR("siteName is missing"))
        siteName = self.args["siteName"]

        # If siteName is None, we take all sites
        if siteName is None:
            allSites = getSites()
            if not allSites["OK"]:
                return self.returnERROR(allSites)
            siteName = allSites["Value"]

        response = self.wmsAdmin.getSiteSummaryWeb({"Site": siteName}, [], 0, 500)
        if not response["OK"]:
            return self.returnERROR(response)
        summary = response["Value"]

        if "ParameterNames" not in summary or "Records" not in summary:
            return self.returnERROR(S_ERROR("Malformed result dictionary"))
        columns = summary["ParameterNames"]

        jobResults = []
        for row in summary["Records"]:
            siteStats = dict(zip(columns, row))
            try:
                siteStats["Efficiency"] = float(siteStats["Efficiency"])
            except (KeyError, ValueError) as e:
                return self.returnERROR(S_ERROR(e))
            jobResults.append(siteStats)

        return S_OK(jobResults)
def setSiteStatus(self, site, status, comment='No comment'):
    """
    Set the status of a site, either in the 'SiteStatus' table of RSS (when
    the RSS flag is on) or in the WMS site mask otherwise.

    examples
      >>> siteStatus.setSiteStatus( 'site1.test.test', 'Banned' )
          S_OK()
      >>> siteStatus.setSiteStatus( 'site1.test.test', None )
          S_ERROR( ... )

    :Parameters:
      **site** - `String`
        the site whose status is going to be changed
      **status** - `String`
        the new status, case insensitive; one of 'Active', 'Banned',
        'Degraded', 'Probing', 'Error', 'Unknown'
      **comment** - `String`
        reason for the status change

    :return: S_OK() || S_ERROR()
    """
    if not status:
        return S_ERROR(DErrno.ERESUNK, 'status parameter is empty')

    # fix case sensitive string
    status = status.capitalize()
    allowedStateList = ['Active', 'Banned', 'Degraded', 'Probing', 'Error', 'Unknown']

    if status not in allowedStateList:
        return S_ERROR(errno.EINVAL, 'Not a valid status, parameter rejected')

    if self.rssFlag:
        # The status token is owned by the user of the current proxy
        result = getProxyInfo()
        if result['OK']:
            tokenOwner = result['Value']['username']
        else:
            return S_ERROR("Unable to get user proxy info %s " % result['Message'])

        tokenExpiration = datetime.utcnow() + timedelta(days=1)

        self.rssCache.acquireLock()
        try:
            result = self.rsClient.modifyStatusElement('Site', 'Status', status=status, name=site,
                                                       tokenExpiration=tokenExpiration, reason=comment,
                                                       tokenOwner=tokenOwner)
            if result['OK']:
                self.rssCache.refreshCache()
            else:
                _msg = 'Error updating status of site %s to %s' % (site, status)
                gLogger.warn('RSS: %s' % _msg)

        # Release lock, no matter what.
        finally:
            self.rssCache.releaseLock()

    else:
        # Without RSS only the site mask exists: 'Active' and 'Degraded'
        # sites are allowed, any other status bans the site. The site and
        # the comment must be passed on, otherwise the request does not
        # say which site is concerned.
        if status in ['Active', 'Degraded']:
            result = WMSAdministratorClient().allowSite(site, comment)
        else:
            result = WMSAdministratorClient().banSite(site, comment)

    return result