def __checkSitesInMask(self, job, siteCandidates):
    """Filter the candidate sites, keeping those present in the current mask.

    The mask is the list returned by ``SiteStatus().getUsableSites('ComputingAccess')``.

    :param job: job identifier, used only in the log messages
    :param siteCandidates: iterable of candidate site names
    :returns: S_OK(list of candidates found in the mask, in their original order)
              or S_ERROR if the mask could not be retrieved
    """
    maskResult = SiteStatus().getUsableSites('ComputingAccess')
    if not maskResult['OK']:
        return S_ERROR('Could not get site mask')

    allowedSites = maskResult['Value']
    selected = []
    for site in siteCandidates:
        if site in allowedSites:
            selected.append(site)
            continue
        # Rejected candidates are only reported, never treated as an error
        self.log.verbose('%s is a candidate site for job %s but not in mask' % (site, job))

    self.log.info('Candidate sites in Mask are %s' % (selected))
    return S_OK(selected)
def getSiteMask(self, printOutput=False):
    """Retrieve the current site mask.

    The mask is whatever ``SiteStatus().getUsableSites('ComputingAccess')``
    returns; that result structure is handed back to the caller unchanged.

    Example usage:

    >>> result = diracAdmin.getSiteMask()

    :param bool printOutput: if True and the call succeeded, sort the sites
                             and print them, one per line
    :returns: S_OK,S_ERROR -- the structure returned by ``getUsableSites``;
              on success its ``'Value'`` holds the sites
    """
    result = SiteStatus().getUsableSites('ComputingAccess')
    if result['OK'] and printOutput:
        sites = result['Value']
        # In-place sort: the returned result['Value'] is sorted as well,
        # exactly as before.
        sites.sort()
        for site in sites:
            # Function-call form prints the same on Python 2 and Python 3
            print(site)
    return result
def getSiteMask(self, printOutput=False):
    """Retrieve the current site mask.

    The sites are obtained through ``SiteStatus().getUsableSites('ComputingAccess')``
    and the result structure of that call is returned as is.

    Example usage:

    >>> result = diracAdmin.getSiteMask()

    :param bool printOutput: when True (and the call succeeded) the sites are
                             sorted and printed one per line
    :returns: S_OK,S_ERROR -- the structure returned by ``getUsableSites``;
              on success its ``'Value'`` holds the sites
    """
    siteStatus = SiteStatus()
    result = siteStatus.getUsableSites('ComputingAccess')
    if not result['OK']:
        return result

    if printOutput:
        sites = result['Value']
        # Sorting happens in place, so the returned result['Value'] is
        # sorted too (unchanged behaviour).
        sites.sort()
        for site in sites:
            # print() with a single argument behaves identically on Python 2 and 3
            print(site)
    return result
def __checkSitesInMask(self, job, siteCandidates):
    """Return the site candidates that are part of the current site mask.

    The mask is obtained from ``SiteStatus().getUsableSites('ComputingAccess')``.

    :param job: job identifier, only used for logging
    :param siteCandidates: iterable of candidate site names
    :returns: S_OK(list of accepted candidates, original order preserved),
              S_ERROR when the mask cannot be obtained
    """
    statusClient = SiteStatus()
    maskResult = statusClient.getUsableSites('ComputingAccess')
    if not maskResult['OK']:
        return S_ERROR('Could not get site mask')

    inMask = maskResult['Value']
    accepted = []
    for name in siteCandidates:
        if name in inMask:
            accepted.append(name)
        else:
            # A candidate outside the mask is dropped with a verbose trace only
            self.log.verbose(
                '%s is a candidate site for job %s but not in mask' % (name, job))

    self.log.info('Candidate sites in Mask are %s' % (accepted))
    return S_OK(accepted)
class Matcher(object):
    """Logic for matching a waiting job to the resource asking for work.

    ``selectJob`` is the entry point; the underscore-prefixed methods are the
    individual steps it goes through.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; statement nesting follows the only reading under which every
    name is defined before use -- confirm against the upstream file.
    """

    def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
        """c'tor

        Every collaborator may be injected; any argument that is falsy is
        replaced by a freshly constructed default object.

        :param pilotAgentsDB: object used as ``self.pilotAgentsDB`` (default ``PilotAgentsDB()``)
        :param jobDB: object used as ``self.jobDB`` (default ``JobDB()``)
        :param tqDB: object used as ``self.tqDB`` (default ``TaskQueueDB()``)
        :param jlDB: object used as ``self.jlDB`` (default ``JobLoggingDB()``)
        :param opsHelper: object used as ``self.opsHelper`` (default ``Operations()``)
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job
        matching the resource capacity.

        :param dict resourceDescription: description of the requesting resource
        :param dict credDict: credentials of the requester
        :returns: ``{}`` when no job matches, otherwise a dictionary with the
                  keys 'JDL', 'JobID', 'DN', 'Group', 'PilotInfoReportedFlag'
                  plus the job optimizer parameters when they can be read
        :raises RuntimeError: when the matching, the job attributes or the JDL
                              cannot be obtained, or when the matched job is
                              not in 'Waiting' status
        """
        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict['MaxRAM'] = resourceDescription['MaxRAM']
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict['NumberOfProcessors'] = resourceDescription['NumberOfProcessors']
        # The tags generated from MaxRAM / NumberOfProcessors are not printed
        visibleTags = []
        if "Tag" in resourceDict:
            visibleTags = [tag for tag in resourceDict['Tag']
                           if not tag.endswith('GB') and not tag.endswith('Processors')]
        if visibleTags:
            toPrintDict['Tag'] = visibleTags
        else:
            toPrintDict.pop('Tag', None)

        gLogger.info('Resource description for matching', printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
        result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

        if not result['OK']:
            raise RuntimeError(result['Message'])
        result = result['Value']
        if not result['matchFound']:
            self.log.info("No match found")
            return {}

        jobID = result['jobId']
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError("No attributes returned for job")
        if resAtt['Value']['Status'] != 'Waiting':
            self.log.error('Job matched by the TQ is not in Waiting state', str(jobID))
            # The stale entry is removed from the task queue before giving up
            result = self.tqDB.deleteJob(jobID)
            if not result['OK']:
                raise RuntimeError(result['Message'])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result['OK']:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict['JDL'] = result['Value']
        resultDict['JobID'] = jobID

        matchTime = time.time() - startTime
        self.log.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            for key, value in resOpt['Value'].items():
                resultDict[key] = value
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError('No attributes returned for job')

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

        pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag', False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """From resourceDescription to resourceDict (just various mods).

        Chains ``_processResourceDescription``, ``_checkCredentials``,
        ``_checkPilotVersion`` and ``_checkMask``.

        :returns: the resource dictionary used for the matching
        :raises RuntimeError: propagated from the individual checks
        """
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict['JobType'] = 'Test'

        self.log.verbose("Resource description:")
        for key in resourceDict:
            self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary.

        :param resourceDescription: a ceDict coming from a JobAgent, for example.
        :returns: new dictionary holding the recognised fields plus a 'Tag'
                  list extended with the tags derived from 'MaxRAM',
                  'NumberOfProcessors' and 'WholeNode'
        """
        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        # NOTE(review): the 'Required<name>' lookup is taken to be independent
        # of the '<name>' test above it -- confirm against the upstream file.
        for name in tagMatchFields:
            if name in resourceDescription and resourceDescription[name]:
                resourceDict[name] = resourceDescription[name]
            rname = 'Required%s' % name
            if rname in resourceDescription:
                resourceDict[rname] = resourceDescription[rname]

        if 'JobID' in resourceDescription:
            resourceDict['JobID'] = resourceDescription['JobID']

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get('MaxRAM')
        if maxRAM:
            try:
                # Floor division keeps an integer whatever the division
                # semantics in force: range() below rejects a float.
                maxRAM = int(maxRAM) // 1000
            except (ValueError, TypeError):
                maxRAM = None
        nProcessors = resourceDescription.get('NumberOfProcessors')
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except (ValueError, TypeError):
                nProcessors = None
        for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
            if param and param <= 128:
                paramList = range(2, param + 1)
                paramTags = ['%d%s' % (par, key) for par in paramList]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        if 'Tag' in resourceDict:
            # Remove duplicates (the resulting order is not defined)
            resourceDict['Tag'] = list(set(resourceDict['Tag']))

        for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                  'VirtualOrganization', 'PilotReference', 'PilotBenchmark',
                  'PilotInfoReportedFlag'):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Reports the status of the matched job in jobDB and jobLoggingDB.

        Do not fail if errors happen here: failures are only logged.
        """
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result['OK']:
            self.log.error("Problem reporting job status",
                           "setJobAttributes, jobID = %s: %s" % (jobID, result['Message']))
        else:
            self.log.verbose("Set job attributes for jobID %s" % jobID)

        result = self.jlDB.addLoggingRecord(jobID,
                                            status='Matched',
                                            minor='Assigned',
                                            source='Matcher')
        if not result['OK']:
            self.log.error("Problem reporting job status",
                           "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message']))
        else:
            self.log.verbose("Added logging record for jobID %s" % jobID)

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :returns: True if resourceDict['Site'] is among the usable sites, False otherwise
        :raises RuntimeError: if 'Site' is missing or the usable sites cannot be obtained
        """
        if 'Site' not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict['Site'])
        if not result['OK']:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result['Message'])
            raise RuntimeError("Internal error")

        return resourceDict['Site'] in result['Value']

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it.

        Nothing is done when resourceDict carries no 'PilotReference'.
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            gridCE = resourceDict.get('GridCE', 'Unknown')
            site = resourceDict.get('Site', 'Unknown')
            benchmark = resourceDict.get('PilotBenchmark', 0.0)
            self.log.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' %
                             (pilotReference, gridCE, site, benchmark))

            result = self.pilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
            if not result['OK']:
                self.log.warn("Problem updating pilot information",
                              "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message']))

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information.

        Nothing is done when resourceDict carries no 'PilotReference';
        failures are only logged.
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result['OK']:
                self.log.error("Problem updating pilot information",
                               ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message']))
            result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
            if not result['OK']:
                self.log.error("Problem updating pilot information",
                               "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message']))

    def _checkCredentials(self, resourceDict, credDict):
        """Check if we can get a job given the passed credentials.

        Sets 'OwnerGroup' and/or 'OwnerDN' in resourceDict according to the
        properties found in credDict.

        :returns: the (modified in place) resourceDict
        :raises RuntimeError: when a Registry lookup fails
        """
        if Properties.GENERIC_PILOT in credDict['properties']:
            # You can only match groups in the same VO
            if credDict['group'] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get('VirtualOrganization', '')
            else:
                vo = Registry.getVOForGroup(credDict['group'])
            result = Registry.getGroupsForVO(vo)
            if result['OK']:
                resourceDict['OwnerGroup'] = result['Value']
            else:
                raise RuntimeError(result['Message'])
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict['properties']:
                self.log.notice("Setting the resource DN to the credentials DN")
                resourceDict['OwnerDN'] = credDict['DN']
            # If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            elif Properties.JOB_SHARING in credDict['properties']:
                resourceDict['OwnerGroup'] = credDict['group']
                self.log.notice("Setting the resource group to the credentials group")
                if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
                    ownerDN = resourceDict['OwnerDN']
                    result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                    if not result['OK']:
                        raise RuntimeError(result['Message'])
                    if credDict['group'] not in result['Value']:
                        # DN is not in the same group! bad boy.
                        self.log.notice("You cannot request jobs from DN %s. It does not belong to your group!" %
                                        ownerDN)
                        resourceDict['OwnerDN'] = credDict['DN']
            # Nothing special, group and DN have to be the same
            else:
                resourceDict['OwnerDN'] = credDict['DN']
                resourceDict['OwnerGroup'] = credDict['group']

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version.

        Nothing is checked unless the "Pilot/CheckVersion" option is true
        (it defaults to True).

        :raises RuntimeError: when no version is provided, when the version is
                              not among "Pilot/Version" (if that list is set),
                              or when the project does not equal
                              "Pilot/Project" (if that option is set)
        """
        if not self.opsHelper.getValue("Pilot/CheckVersion", True):
            return

        # 'ReleaseVersion' takes precedence over 'DIRACVersion'
        if 'ReleaseVersion' in resourceDict:
            pilotVersion = resourceDict['ReleaseVersion']
        elif 'DIRACVersion' in resourceDict:
            pilotVersion = resourceDict['DIRACVersion']
        else:
            raise RuntimeError('Version check requested and not provided by Pilot')

        validVersions = self.opsHelper.getValue("Pilot/Version", [])
        if validVersions and pilotVersion not in validVersions:
            raise RuntimeError('Pilot version does not match the production version %s not in ( %s )' %
                               (pilotVersion, ",".join(validVersions)))

        # Check project if requested
        validProject = self.opsHelper.getValue("Pilot/Project", "")
        if validProject:
            if 'ReleaseProject' not in resourceDict:
                raise RuntimeError("Version check requested but expected project %s not received" % validProject)
            if resourceDict['ReleaseProject'] != validProject:
                # Message kept on one logical string: a line continuation inside
                # the literal used to leak a backslash/whitespace into the text.
                raise RuntimeError("Version check requested but expected project %s != received %s" %
                                   (validProject, resourceDict['ReleaseProject']))
class Matcher(object):
    """Logic for matching a waiting job to the resource asking for work.

    ``selectJob`` is the entry point; the underscore-prefixed methods are the
    individual steps it goes through.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; statement nesting follows the only reading under which every
    name is defined before use -- confirm against the upstream file.
    """

    def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
        """c'tor

        Every collaborator may be injected; any argument that is falsy is
        replaced by a freshly constructed default object.

        :param pilotAgentsDB: object used as ``self.pilotAgentsDB`` (default ``PilotAgentsDB()``)
        :param jobDB: object used as ``self.jobDB`` (default ``JobDB()``)
        :param tqDB: object used as ``self.tqDB`` (default ``TaskQueueDB()``)
        :param jlDB: object used as ``self.jlDB`` (default ``JobLoggingDB()``)
        :param opsHelper: object used as ``self.opsHelper`` (default ``Operations()``)
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job
        matching the resource capacity.

        :param dict resourceDescription: description of the requesting resource
        :param dict credDict: credentials of the requester
        :returns: ``{}`` when no job matches, otherwise a dictionary with the
                  keys 'JDL', 'JobID', 'DN', 'Group', 'PilotInfoReportedFlag'
                  plus the job optimizer parameters when they can be read
        :raises RuntimeError: when the matching, the job attributes or the JDL
                              cannot be obtained, or when the matched job is
                              not in 'Waiting' status
        """
        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict['MaxRAM'] = resourceDescription['MaxRAM']
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict['NumberOfProcessors'] = resourceDescription['NumberOfProcessors']
        # The tags generated from MaxRAM / NumberOfProcessors are not printed
        visibleTags = []
        if "Tag" in resourceDict:
            visibleTags = [tag for tag in resourceDict['Tag']
                           if not tag.endswith('GB') and not tag.endswith('Processors')]
        if visibleTags:
            toPrintDict['Tag'] = visibleTags
        else:
            toPrintDict.pop('Tag', None)

        gLogger.info('Resource description for matching', printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
        result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

        if not result['OK']:
            raise RuntimeError(result['Message'])
        result = result['Value']
        if not result['matchFound']:
            self.log.info("No match found")
            return {}

        jobID = result['jobId']
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError("No attributes returned for job")
        if resAtt['Value']['Status'] != 'Waiting':
            self.log.error('Job matched by the TQ is not in Waiting state', str(jobID))
            # The stale entry is removed from the task queue before giving up
            result = self.tqDB.deleteJob(jobID)
            if not result['OK']:
                raise RuntimeError(result['Message'])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result['OK']:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict['JDL'] = result['Value']
        resultDict['JobID'] = jobID

        matchTime = time.time() - startTime
        self.log.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            for key, value in resOpt['Value'].items():
                resultDict[key] = value
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError('No attributes returned for job')

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

        pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag', False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """From resourceDescription to resourceDict (just various mods).

        Chains ``_processResourceDescription``, ``_checkCredentials``,
        ``_checkPilotVersion`` and ``_checkMask``.

        :returns: the resource dictionary used for the matching
        :raises RuntimeError: propagated from the individual checks
        """
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict['JobType'] = 'Test'

        self.log.verbose("Resource description:")
        for key in resourceDict:
            self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary.

        :param resourceDescription: a ceDict coming from a JobAgent, for example.
        :return: updated dictionary of resource description parameters
        """
        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        # NOTE(review): 'RequiredTag' is taken to be honoured only when a
        # non-empty 'Tag' is present -- confirm against the upstream file.
        if resourceDescription.get('Tag'):
            resourceDict['Tag'] = resourceDescription['Tag']
            if 'RequiredTag' in resourceDescription:
                resourceDict['RequiredTag'] = resourceDescription['RequiredTag']

        if 'JobID' in resourceDescription:
            resourceDict['JobID'] = resourceDescription['JobID']

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get('MaxRAM')
        if maxRAM:
            try:
                # Floor division keeps an integer whatever the division
                # semantics in force: range() below rejects a float.
                maxRAM = int(maxRAM) // 1000
            except (ValueError, TypeError):
                maxRAM = None
        nProcessors = resourceDescription.get('NumberOfProcessors')
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except (ValueError, TypeError):
                nProcessors = None
        for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
            if param and param <= 128:
                paramList = range(2, param + 1)
                paramTags = ['%d%s' % (par, key) for par in paramList]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        # Add 'MultiProcessor' to the list of tags.
        # nProcessors is None when absent or not convertible; comparing None
        # with an int is an error on Python 3, hence the explicit guard.
        if nProcessors and nProcessors > 1:
            resourceDict.setdefault("Tag", []).append("MultiProcessor")

        # Add 'WholeNode' to the list of tags
        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        if 'Tag' in resourceDict:
            # Remove duplicates (the resulting order is not defined)
            resourceDict['Tag'] = list(set(resourceDict['Tag']))

        for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                  'VirtualOrganization', 'PilotReference', 'PilotBenchmark',
                  'PilotInfoReportedFlag'):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Reports the status of the matched job in jobDB and jobLoggingDB.

        Do not fail if errors happen here: failures are only logged.
        """
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result['OK']:
            self.log.error("Problem reporting job status",
                           "setJobAttributes, jobID = %s: %s" % (jobID, result['Message']))
        else:
            self.log.verbose("Set job attributes for jobID %s" % jobID)

        result = self.jlDB.addLoggingRecord(jobID,
                                            status='Matched',
                                            minor='Assigned',
                                            source='Matcher')
        if not result['OK']:
            self.log.error("Problem reporting job status",
                           "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message']))
        else:
            self.log.verbose("Added logging record for jobID %s" % jobID)

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :returns: True if resourceDict['Site'] is among the usable sites, False otherwise
        :raises RuntimeError: if 'Site' is missing or the usable sites cannot be obtained
        """
        if 'Site' not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict['Site'])
        if not result['OK']:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result['Message'])
            raise RuntimeError("Internal error")

        return resourceDict['Site'] in result['Value']

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it.

        Nothing is done when resourceDict carries no 'PilotReference'.
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            gridCE = resourceDict.get('GridCE', 'Unknown')
            site = resourceDict.get('Site', 'Unknown')
            benchmark = resourceDict.get('PilotBenchmark', 0.0)
            self.log.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' %
                             (pilotReference, gridCE, site, benchmark))

            result = self.pilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
            if not result['OK']:
                self.log.warn("Problem updating pilot information",
                              "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message']))

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information.

        Nothing is done when resourceDict carries no 'PilotReference';
        failures are only logged.
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result['OK']:
                self.log.error("Problem updating pilot information",
                               ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message']))
            result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
            if not result['OK']:
                self.log.error("Problem updating pilot information",
                               "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message']))

    def _checkCredentials(self, resourceDict, credDict):
        """Check if we can get a job given the passed credentials.

        Sets 'OwnerGroup' and/or 'OwnerDN' in resourceDict according to the
        properties found in credDict.

        :returns: the (modified in place) resourceDict
        :raises RuntimeError: when a Registry lookup fails
        """
        if Properties.GENERIC_PILOT in credDict['properties']:
            # You can only match groups in the same VO
            if credDict['group'] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get('VirtualOrganization', '')
            else:
                vo = Registry.getVOForGroup(credDict['group'])
            # An 'OwnerGroup' already present in the resource is left untouched
            if 'OwnerGroup' not in resourceDict:
                result = Registry.getGroupsForVO(vo)
                if result['OK']:
                    resourceDict['OwnerGroup'] = result['Value']
                else:
                    raise RuntimeError(result['Message'])
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict['properties']:
                self.log.notice("Setting the resource DN to the credentials DN")
                resourceDict['OwnerDN'] = credDict['DN']
            # If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            elif Properties.JOB_SHARING in credDict['properties']:
                resourceDict['OwnerGroup'] = credDict['group']
                self.log.notice("Setting the resource group to the credentials group")
                if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
                    ownerDN = resourceDict['OwnerDN']
                    result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                    if not result['OK']:
                        raise RuntimeError(result['Message'])
                    if credDict['group'] not in result['Value']:
                        # DN is not in the same group! bad boy.
                        self.log.notice("You cannot request jobs from DN %s. It does not belong to your group!" %
                                        ownerDN)
                        resourceDict['OwnerDN'] = credDict['DN']
            # Nothing special, group and DN have to be the same
            else:
                resourceDict['OwnerDN'] = credDict['DN']
                resourceDict['OwnerGroup'] = credDict['group']

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version.

        Nothing is checked unless the "Pilot/CheckVersion" option is true
        (it defaults to True).

        :raises RuntimeError: when no version is provided, when the version is
                              not among "Pilot/Version" (if that list is set),
                              or when the project does not equal
                              "Pilot/Project" (if that option is set)
        """
        if not self.opsHelper.getValue("Pilot/CheckVersion", True):
            return

        # 'ReleaseVersion' takes precedence over 'DIRACVersion'
        if 'ReleaseVersion' in resourceDict:
            pilotVersion = resourceDict['ReleaseVersion']
        elif 'DIRACVersion' in resourceDict:
            pilotVersion = resourceDict['DIRACVersion']
        else:
            raise RuntimeError('Version check requested and not provided by Pilot')

        validVersions = self.opsHelper.getValue("Pilot/Version", [])
        if validVersions and pilotVersion not in validVersions:
            raise RuntimeError('Pilot version does not match the production version %s not in ( %s )' %
                               (pilotVersion, ",".join(validVersions)))

        # Check project if requested
        validProject = self.opsHelper.getValue("Pilot/Project", "")
        if validProject:
            if 'ReleaseProject' not in resourceDict:
                raise RuntimeError("Version check requested but expected project %s not received" % validProject)
            if resourceDict['ReleaseProject'] != validProject:
                # Message kept on one logical string: a line continuation inside
                # the literal used to leak a backslash/whitespace into the text.
                raise RuntimeError("Version check requested but expected project %s != received %s" %
                                   (validProject, resourceDict['ReleaseProject']))
class CloudDirector(AgentModule): """The CloudDirector works like a SiteDirector for cloud sites: It looks at the queued jobs in the task queues and attempts to start VM instances to meet the current demand. """ def __init__(self, *args, **kwargs): super(CloudDirector, self).__init__(*args, **kwargs) self.vmTypeDict = {} self.vmTypeCECache = {} self.vmTypeSlots = {} self.failedVMTypes = defaultdict(int) self.firstPass = True self.vo = "" self.group = "" # self.voGroups contain all the eligible user groups for clouds submitted by this SiteDirector self.voGroups = [] self.cloudDN = "" self.cloudGroup = "" self.platforms = [] self.sites = [] self.siteClient = None self.proxy = None self.updateStatus = True self.getOutput = False self.sendAccounting = True def initialize(self): self.siteClient = SiteStatus() return S_OK() def beginExecution(self): # The Director is for a particular user community self.vo = self.am_getOption("VO", "") if not self.vo: self.vo = CSGlobals.getVO() # The SiteDirector is for a particular user group self.group = self.am_getOption("Group", "") # Choose the group for which clouds will be submitted. This is a hack until # we will be able to match clouds to VOs. 
if not self.group: if self.vo: result = Registry.getGroupsForVO(self.vo) if not result["OK"]: return result self.voGroups = [] for group in result["Value"]: if "NormalUser" in Registry.getPropertiesForGroup(group): self.voGroups.append(group) else: self.voGroups = [self.group] result = findGenericCloudCredentials(vo=self.vo) if not result["OK"]: return result self.cloudDN, self.cloudGroup = result["Value"] self.maxVMsToSubmit = self.am_getOption("MaxVMsToSubmit", 1) self.runningPod = self.am_getOption("RunningPod", self.vo) # Get the site description dictionary siteNames = None if not self.am_getOption("Site", "Any").lower() == "any": siteNames = self.am_getOption("Site", []) if not siteNames: siteNames = None ces = None if not self.am_getOption("CEs", "Any").lower() == "any": ces = self.am_getOption("CEs", []) if not ces: ces = None result = getVMTypes(vo=self.vo, siteList=siteNames) if not result["OK"]: return result resourceDict = result["Value"] result = self.getEndpoints(resourceDict) if not result["OK"]: return result # if not siteNames: # siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' ) # if siteName == 'Unknown': # return S_OK( 'No site specified for the SiteDirector' ) # else: # siteNames = [siteName] # self.siteNames = siteNames self.log.always("Sites:", siteNames) self.log.always("CEs:", ces) self.log.always("CloudDN:", self.cloudDN) self.log.always("CloudGroup:", self.cloudGroup) self.localhost = socket.getfqdn() self.proxy = "" if self.firstPass: if self.vmTypeDict: self.log.always("Agent will serve VM types:") for vmType in self.vmTypeDict: self.log.always( "Site: %s, CE: %s, VMType: %s" % (self.vmTypeDict[vmType]["Site"], self.vmTypeDict[vmType]["CEName"], vmType) ) self.firstPass = False return S_OK() def __generateVMTypeHash(self, vmTypeDict): """Generate a hash of the queue description""" myMD5 = hashlib.md5() myMD5.update(str(sorted(vmTypeDict.items())).encode()) hexstring = myMD5.hexdigest() return hexstring def getEndpoints(self, 
resourceDict): """Get the list of relevant CEs and their descriptions""" self.vmTypeDict = {} ceFactory = EndpointFactory() result = getPilotBootstrapParameters(vo=self.vo, runningPod=self.runningPod) if not result["OK"]: return result opParameters = result["Value"] for site in resourceDict: for ce in resourceDict[site]: ceDict = resourceDict[site][ce] ceTags = ceDict.get("Tag", []) if isinstance(ceTags, six.string_types): ceTags = fromChar(ceTags) ceMaxRAM = ceDict.get("MaxRAM", None) qDict = ceDict.pop("VMTypes") for vmType in qDict: vmTypeName = "%s_%s" % (ce, vmType) self.vmTypeDict[vmTypeName] = {} self.vmTypeDict[vmTypeName]["ParametersDict"] = qDict[vmType] self.vmTypeDict[vmTypeName]["ParametersDict"]["VMType"] = vmType self.vmTypeDict[vmTypeName]["ParametersDict"]["Site"] = site self.vmTypeDict[vmTypeName]["ParametersDict"]["Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown") self.vmTypeDict[vmTypeName]["ParametersDict"]["CPUTime"] = 99999999 vmTypeTags = self.vmTypeDict[vmTypeName]["ParametersDict"].get("Tag") if vmTypeTags and isinstance(vmTypeTags, six.string_types): vmTypeTags = fromChar(vmTypeTags) self.vmTypeDict[vmTypeName]["ParametersDict"]["Tag"] = vmTypeTags if ceTags: if vmTypeTags: allTags = list(set(ceTags + vmTypeTags)) self.vmTypeDict[vmTypeName]["ParametersDict"]["Tag"] = allTags else: self.vmTypeDict[vmTypeName]["ParametersDict"]["Tag"] = ceTags maxRAM = self.vmTypeDict[vmTypeName]["ParametersDict"].get("MaxRAM") maxRAM = ceMaxRAM if not maxRAM else maxRAM if maxRAM: self.vmTypeDict[vmTypeName]["ParametersDict"]["MaxRAM"] = maxRAM ceWholeNode = ceDict.get("WholeNode", "true") wholeNode = self.vmTypeDict[vmTypeName]["ParametersDict"].get("WholeNode", ceWholeNode) if wholeNode.lower() in ("yes", "true"): self.vmTypeDict[vmTypeName]["ParametersDict"].setdefault("Tag", []) self.vmTypeDict[vmTypeName]["ParametersDict"]["Tag"].append("WholeNode") platform = "" if "Platform" in self.vmTypeDict[vmTypeName]["ParametersDict"]: platform = 
self.vmTypeDict[vmTypeName]["ParametersDict"]["Platform"] elif "Platform" in ceDict: platform = ceDict["Platform"] if platform and platform not in self.platforms: self.platforms.append(platform) if "Platform" not in self.vmTypeDict[vmTypeName]["ParametersDict"] and platform: result = Resources.getDIRACPlatform(platform) if result["OK"]: self.vmTypeDict[vmTypeName]["ParametersDict"]["Platform"] = result["Value"][0] ceVMTypeDict = dict(ceDict) ceVMTypeDict["CEName"] = ce ceVMTypeDict["VO"] = self.vo ceVMTypeDict["VMType"] = vmType ceVMTypeDict["RunningPod"] = self.runningPod ceVMTypeDict["CSServers"] = gConfig.getValue("/DIRAC/Configuration/Servers", []) ceVMTypeDict.update(self.vmTypeDict[vmTypeName]["ParametersDict"]) # Allow a resource-specifc CAPath to be set (as some clouds have their own CAs) # Otherwise fall back to the system-wide default(s) if "CAPath" not in ceVMTypeDict: ceVMTypeDict["CAPath"] = gConfig.getValue( "/DIRAC/Security/CAPath", "/opt/dirac/etc/grid-security/certificates/cas.pem" ) # Generate the CE object for the vmType or pick the already existing one # if the vmType definition did not change vmTypeHash = self.__generateVMTypeHash(ceVMTypeDict) if vmTypeName in self.vmTypeCECache and self.vmTypeCECache[vmTypeName]["Hash"] == vmTypeHash: vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"] else: result = ceFactory.getCEObject(parameters=ceVMTypeDict) if not result["OK"]: return result self.vmTypeCECache.setdefault(vmTypeName, {}) self.vmTypeCECache[vmTypeName]["Hash"] = vmTypeHash self.vmTypeCECache[vmTypeName]["CE"] = result["Value"] vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"] vmTypeCE.setBootstrapParameters(opParameters) self.vmTypeDict[vmTypeName]["CE"] = vmTypeCE self.vmTypeDict[vmTypeName]["CEName"] = ce self.vmTypeDict[vmTypeName]["CEType"] = ceDict["CEType"] self.vmTypeDict[vmTypeName]["Site"] = site self.vmTypeDict[vmTypeName]["VMType"] = vmType self.vmTypeDict[vmTypeName]["Platform"] = platform self.vmTypeDict[vmTypeName]["MaxInstances"] 
= ceDict["MaxInstances"] if not self.vmTypeDict[vmTypeName]["CE"].isValid(): self.log.error("Failed to instantiate CloudEndpoint for %s" % vmTypeName) continue if site not in self.sites: self.sites.append(site) return S_OK() def execute(self): """Main execution method""" if not self.vmTypeDict: self.log.warn("No site defined, exiting the cycle") return S_OK() result = self.createVMs() if not result["OK"]: self.log.error("Errors in the job submission: ", result["Message"]) # cyclesDone = self.am_getModuleParam( 'cyclesDone' ) # if self.updateStatus and cyclesDone % self.cloudStatusUpdateCycleFactor == 0: # result = self.updatePilotStatus() # if not result['OK']: # self.log.error( 'Errors in updating cloud status: ', result['Message'] ) return S_OK() def createVMs(self): """Go through defined computing elements and submit jobs if necessary""" vmTypeList = list(self.vmTypeDict.keys()) # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = {"Setup": setup, "CPUTime": 9999999} if self.vo: tqDict["VO"] = self.vo if self.voGroups: tqDict["OwnerGroup"] = self.voGroups result = Resources.getCompatiblePlatforms(self.platforms) if not result["OK"]: return result tqDict["Platform"] = result["Value"] tqDict["Site"] = self.sites tags = [] for vmType in vmTypeList: if "Tag" in self.vmTypeDict[vmType]["ParametersDict"]: tags += self.vmTypeDict[vmType]["ParametersDict"]["Tag"] tqDict["Tag"] = list(set(tags)) self.log.verbose("Checking overall TQ availability with requirements") self.log.verbose(tqDict) matcherClient = MatcherClient() result = matcherClient.getMatchingTaskQueues(tqDict) if not result["OK"]: return result if not result["Value"]: self.log.verbose("No Waiting jobs suitable for the director") return S_OK() jobSites = set() anySite = False testSites = set() totalWaitingJobs = 0 for tqID in result["Value"]: if "Sites" in result["Value"][tqID]: for site in result["Value"][tqID]["Sites"]: if site.lower() != "any": jobSites.add(site) else: anySite = 
True else: anySite = True if "JobTypes" in result["Value"][tqID]: if "Sites" in result["Value"][tqID]: for site in result["Value"][tqID]["Sites"]: if site.lower() != "any": testSites.add(site) totalWaitingJobs += result["Value"][tqID]["Jobs"] tqIDList = list(result["Value"].keys()) result = virtualMachineDB.getInstanceCounters("Status", {}) totalVMs = 0 if result["OK"]: for status in result["Value"]: if status in ["New", "Submitted", "Running"]: totalVMs += result["Value"][status] self.log.info("Total %d jobs in %d task queues with %d VMs" % (totalWaitingJobs, len(tqIDList), totalVMs)) # Check if the site is allowed in the mask result = self.siteClient.getUsableSites() if not result["OK"]: return S_ERROR("Can not get the site mask") siteMaskList = result.get("Value", []) vmTypeList = list(self.vmTypeDict.keys()) random.shuffle(vmTypeList) totalSubmittedPilots = 0 matchedQueues = 0 for vmType in vmTypeList: ce = self.vmTypeDict[vmType]["CE"] ceName = self.vmTypeDict[vmType]["CEName"] vmTypeName = self.vmTypeDict[vmType]["VMType"] siteName = self.vmTypeDict[vmType]["Site"] platform = self.vmTypeDict[vmType]["Platform"] vmTypeTags = self.vmTypeDict[vmType]["ParametersDict"].get("Tag", []) siteMask = siteName in siteMaskList endpoint = "%s::%s" % (siteName, ceName) maxInstances = int(self.vmTypeDict[vmType]["MaxInstances"]) processorTags = [] # vms support WholeNode naturally processorTags.append("WholeNode") if not anySite and siteName not in jobSites: self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName)) continue if not siteMask and siteName not in testSites: self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName)) continue if "CPUTime" in self.vmTypeDict[vmType]["ParametersDict"]: vmTypeCPUTime = int(self.vmTypeDict[vmType]["ParametersDict"]["CPUTime"]) else: self.log.warn("CPU time limit is not specified for queue %s, skipping..." 
% vmType) continue # Prepare the queue description to look for eligible jobs ceDict = ce.getParameterDict() if not siteMask: ceDict["JobType"] = "Test" if self.vo: ceDict["VO"] = self.vo if self.voGroups: ceDict["OwnerGroup"] = self.voGroups result = Resources.getCompatiblePlatforms(platform) if not result["OK"]: continue ceDict["Platform"] = result["Value"] ceDict["Tag"] = list(set(processorTags + vmTypeTags)) # Get the number of eligible jobs for the target site/queue result = matcherClient.getMatchingTaskQueues(ceDict) if not result["OK"]: self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"]) return result taskQueueDict = result["Value"] if not taskQueueDict: self.log.verbose("No matching TQs found for %s" % vmType) continue matchedQueues += 1 totalTQJobs = 0 tqIDList = list(taskQueueDict.keys()) for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]["Jobs"] self.log.verbose( "%d job(s) from %d task queue(s) are eligible for %s queue" % (totalTQJobs, len(tqIDList), vmType) ) # Get the number of already instantiated VMs for these task queues totalWaitingVMs = 0 result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint}) if result["OK"]: for status in result["Value"]: if status in ["New", "Submitted"]: totalWaitingVMs += result["Value"][status] if totalWaitingVMs >= totalTQJobs: self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs) self.log.verbose("%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, vmType)) # Get proxy to be used to connect to the cloud endpoint authType = ce.parameters.get("Auth") if authType and authType.lower() in ["x509", "voms"]: self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName)) result = getProxyFileForCloud(ce) if not result["OK"]: continue ce.setProxy(result["Value"]) # Get the number of available slots on the target site/endpoint totalSlots = self.getVMInstances(endpoint, maxInstances) if totalSlots == 0: 
self.log.debug("%s: No slots available" % vmType) continue vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs)) self.log.info( "%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d" % (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit) ) # Limit the number of VM instances to create to vmsToSubmit vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit) if vmsToSubmit == 0: continue self.log.info("Going to submit %d VMs to %s queue" % (vmsToSubmit, vmType)) result = ce.createInstances(vmsToSubmit) # result = S_OK() if not result["OK"]: self.log.error("Failed submission to queue %s:\n" % vmType, result["Message"]) self.failedVMTypes.setdefault(vmType, 0) self.failedVMTypes[vmType] += 1 continue # Add VMs to the VirtualMachineDB vmDict = result["Value"] totalSubmittedPilots += len(vmDict) self.log.info("Submitted %d VMs to %s@%s" % (len(vmDict), vmTypeName, ceName)) pilotList = [] for uuID in vmDict: diracUUID = vmDict[uuID]["InstanceID"] endpoint = "%s::%s" % (self.vmTypeDict[vmType]["Site"], ceName) result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo) if not result["OK"]: continue pRef = "vm://" + ceName + "/" + diracUUID + ":00" pilotList.append(pRef) stampDict = {} tqPriorityList = [] sumPriority = 0.0 for tq in taskQueueDict: sumPriority += taskQueueDict[tq]["Priority"] tqPriorityList.append((tq, sumPriority)) tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if tqID not in tqDict: tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, "", "", self.localhost, "Cloud", stampDict) if not result["OK"]: self.log.error("Failed to insert pilots into the PilotAgentsDB: %s" % result["Message"]) self.log.info( "%d VMs submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues) ) return S_OK() def 
getVMInstances(self, endpoint, maxInstances): result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint}) if not result["OK"]: return result count = 0 for status in result["Value"]: if status in ["New", "Submitted", "Running"]: count += int(result["Value"][status]) return max(0, maxInstances - count)
def _resolveCECandidates(self, taskQueueDict):
    """
    Work out the list of CEs that can be targeted for a TaskQueue.

    :param dict taskQueueDict: TaskQueue description. The keys read here are
        'TaskQueueID', and optionally 'GridCEs', 'BannedSites' and 'Sites'.
    :return: the 'GridCEs' value when it is set and non-empty; otherwise the
        list of CE hosts found for the sites left in the mask. An empty list
        is returned whenever the mask or the eligible resources cannot be
        obtained, or no site survives the filtering.
    """
    # An explicit CE request short-circuits everything else: the site mask is
    # not consulted (assume user knows what they're doing, e.g. sam jobs)
    requestedCEs = taskQueueDict.get('GridCEs')
    if requestedCEs:
        self.log.info('CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                      ', '.join(requestedCEs))
        return requestedCEs

    # Fetch the mask of usable sites
    maskResult = SiteStatus().getUsableSites('ComputingAccess')
    if not maskResult['OK']:
        self.log.error('Can not retrieve site Mask from DB:', maskResult['Message'])
        return []

    allowedSites = maskResult['Value']
    if not allowedSites:
        self.log.error('Site mask is empty')
        return []
    self.log.verbose('Site Mask: %s' % ', '.join(allowedSites))

    # Take the TaskQueue's banned sites out of the mask (the list is edited in place)
    for bannedSite in taskQueueDict.get('BannedSites', ()):
        if bannedSite not in allowedSites:
            continue
        allowedSites.remove(bannedSite)
        self.log.verbose('Removing banned site %s from site Mask' % bannedSite)

    # When the TaskQueue names sites, keep only those; otherwise keep them all
    if 'Sites' in taskQueueDict:
        wantedSites = taskQueueDict['Sites']
        siteMask = [site for site in allowedSites if site in wantedSites]
    else:
        siteMask = list(allowedSites)

    tqID = taskQueueDict['TaskQueueID']
    if not siteMask:
        # pilot can not be submitted
        self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' % tqID)
        return []
    self.log.info('Site Candidates for TaskQueue %s:' % tqID, ', '.join(siteMask))

    # Look up the CEs attached to the remaining sites
    resources = Resources(vo=self.virtualOrganization)
    selection = {'Site': siteMask,
                 'SubmissionMode': 'gLite',
                 'CEType': ['LCG', 'CREAM']}
    eligible = resources.getEligibleResources('Computing', selection)
    if not eligible['OK']:
        self.log.error("Failed to get eligible ce's:", eligible['Message'])
        return []

    # CEs for which no 'Host' value is returned (default 'unknown') are left out
    hosts = [resources.getComputingElementValue(ce, 'Host', 'unknown')
             for ce in eligible['Value']]
    ceMask = [host for host in hosts if host != 'unknown']

    if not ceMask:
        self.log.info('No CE Candidate found for TaskQueue %s:' % tqID, ', '.join(siteMask))
    self.log.verbose('CE Candidates for TaskQueue %s:' % tqID, ', '.join(ceMask))

    return ceMask
def _resolveCECandidates(self, taskQueueDict):
    """
    Return the CEs to which pilots for this TaskQueue may be sent.

    :param dict taskQueueDict: TaskQueue description; 'TaskQueueID' is read
        for logging, 'GridCEs', 'BannedSites' and 'Sites' are optional.
    :return: list of CEs. This is the TaskQueue's own 'GridCEs' when given,
        else the 'Host' values of the eligible CEs at the sites that remain
        in the mask. Every failure path returns an empty list.
    """
    # assume user knows what they're doing and avoid site mask e.g. sam jobs
    if 'GridCEs' in taskQueueDict:
        explicitCEs = taskQueueDict['GridCEs']
        if explicitCEs:
            self.log.info(
                'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                ', '.join(explicitCEs))
            return explicitCEs

    # Get the mask
    statusClient = SiteStatus()
    maskRes = statusClient.getUsableSites('ComputingAccess')
    if not maskRes['OK']:
        self.log.error('Can not retrieve site Mask from DB:', maskRes['Message'])
        return []

    maskSites = maskRes['Value']
    if not maskSites:
        self.log.error('Site mask is empty')
        return []

    self.log.verbose('Site Mask: %s' % ', '.join(maskSites))

    # Remove banned sites from the mask; the returned list itself is modified
    if 'BannedSites' in taskQueueDict:
        for banned in taskQueueDict['BannedSites']:
            if banned in maskSites:
                maskSites.remove(banned)
                self.log.verbose('Removing banned site %s from site Mask' % banned)

    # Restrict the mask to the requested sites, if any were requested
    restrictToSites = 'Sites' in taskQueueDict
    siteMask = []
    for candidate in maskSites:
        if not restrictToSites or candidate in taskQueueDict['Sites']:
            siteMask.append(candidate)

    if not siteMask:
        # pilot can not be submitted
        self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' %
                      taskQueueDict['TaskQueueID'])
        return []

    joinedSites = ', '.join(siteMask)
    self.log.info(
        'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
        joinedSites)

    # Get the CEs associated to the selected site names
    resources = Resources(vo=self.virtualOrganization)
    ceQuery = {
        'Site': siteMask,
        'SubmissionMode': 'gLite',
        'CEType': ['LCG', 'CREAM'],
    }
    ceRes = resources.getEligibleResources('Computing', ceQuery)
    if not ceRes['OK']:
        self.log.error("Failed to get eligible ce's:", ceRes['Message'])
        return []

    ceMask = []
    for ceName in ceRes['Value']:
        hostName = resources.getComputingElementValue(ceName, 'Host', 'unknown')
        if hostName == 'unknown':
            # no 'Host' value was returned for this CE: skip it
            continue
        ceMask.append(hostName)

    if not ceMask:
        self.log.info(
            'No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
            joinedSites)

    self.log.verbose(
        'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
        ', '.join(ceMask))

    return ceMask
def optimizeJob(self, jid, jobState):
    """
    Decide where a job can go: hold it, send it to the task queue, or request staging.

    The steps are, in order: enforce the reschedule delay, read the user's
    site requirements, read the usable/unusable sites, check the input data
    and its site candidates, and finally either send the job to the task
    queue or request staging at one site.

    :param jid: job identifier (not used inside this method)
    :param jobState: job state object used to read the job's attributes and input data
    :return: S_OK/S_ERROR structure, as produced by the helper that ends the chain
    """
    # ---- Reschedule delay
    attrsRes = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not attrsRes['OK']:
        return attrsRes
    jobAttrs = attrsRes['Value']
    try:
        nReschedules = int(jobAttrs['RescheduleCounter'])
    except ValueError:
        return S_ERROR("RescheduleCounter has to be an integer")
    if nReschedules:
        delayTable = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Counters beyond the end of the table use its last entry
        holdFor = delayTable[min(nReschedules, len(delayTable) - 1)]
        elapsed = toEpoch() - toEpoch(fromString(jobAttrs['RescheduleTime']))
        if elapsed < holdFor:
            return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % nReschedules, holdFor)

    # ---- Site requirements expressed by the user
    requiredRes = self._getSitesRequired(jobState)
    if not requiredRes['OK']:
        return requiredRes
    userSites, userBannedSites = requiredRes['Value']

    # ---- Active and banned sites from DIRAC
    siteStatus = SiteStatus()
    usableRes = siteStatus.getUsableSites('ComputingAccess')
    if not usableRes['OK']:
        return S_ERROR("Cannot retrieve active sites from JobDB")
    usableSites = usableRes['Value']
    unusableRes = siteStatus.getUnusableSites('ComputingAccess')
    if not unusableRes['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    unusableSites = unusableRes['Value']

    # ---- User-selected sites: hold the job if none of them can run it,
    #      unless the job type is excluded from being put on hold
    if userSites:
        typeRes = jobState.getAttribute("JobType")
        if not typeRes['OK']:
            return S_ERROR("Could not retrieve job type")
        if typeRes['Value'] not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            if not self._applySiteFilter(userSites, usableSites, unusableSites):
                return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(userSites))

    # ---- Input data
    dataRes = jobState.getInputData()
    if not dataRes['OK']:
        self.jobLog.error("Cannot get input data %s" % (dataRes['Message']))
        return S_ERROR('Failed to get input data from JobDB')

    inputData = dataRes['Value']
    if not inputData:
        # No input data: nothing else to resolve, go straight to the task queue
        return self.__sendToTQ(jobState, userSites, userBannedSites)

    self.jobLog.verbose('Has an input data requirement')
    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    paramRes = self.retrieveOptimizerParam(idAgent)
    if not paramRes['OK']:
        self.jobLog.error("Could not retrieve input data info: %s" % paramRes['Message'])
        return S_ERROR("File Catalog Access Failure")
    opData = paramRes['Value']
    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # ---- Combine the input data sites with the user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    siteCandidates = self._applySiteFilter(siteCandidates, userSites, userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = dict((site, opData['SiteCandidates'][site]) for site in siteCandidates)

    # ---- Drop sites whose disk+tape replica count does not cover all the input files
    numData = len(inputData)
    incompleteSites = set()
    for site, replicaCount in idSites.items():
        if numData != replicaCount['disk'] + replicaCount['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            incompleteSites.add(site)
    for site in incompleteSites:
        del idSites[site]
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # ---- Is staging required?
    stageRequired, siteCandidates = self.__resolveStaging(jobState, inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # ---- Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, usableSites, unusableSites)
    if not stageSites:
        return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    if not stageRequired:
        # siteCandidates (not stageSites) is passed on purpose: active and banned
        # sites will be taken into account on matching time
        return self.__sendToTQ(jobState, siteCandidates, userBannedSites)

    # ---- Staging is needed: check that the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False) and not self.__checkStageAllowed(jobState):
        return S_ERROR("Stage not allowed")

    # The first entry is used because the list has already been randomized,
    # so it is as good as any other entry
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Record the replicas as if everything had already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Put the site info back into the original dict so it is saved afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stagingRes = self.__requestStaging(jobState, stageSite, opData)
    if not stagingRes['OK']:
        return stagingRes
    self._updateSharedSESites(stageSite, stagingRes['Value'], opData)

    # ---- Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    storeRes = self.storeOptimizerParam(idAgent, opData)
    if not storeRes['OK']:
        return storeRes

    return self._setJobSite(jobState, stageSites)
class Matcher(object):
    """Logic for matching a resource description against the waiting jobs."""

    def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None, pilotRef=None):
        """c'tor

        :param pilotAgentsDB: PilotAgentsDB object; a new one is created when not given
        :param jobDB: JobDB object; a new one is created when not given
        :param tqDB: TaskQueueDB object; a new one is created when not given
        :param jlDB: JobLoggingDB object; a new one is created when not given
        :param opsHelper: Operations helper; a new one is created when not given
        :param pilotRef: optional pilot reference, used to prefix the logger names
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        if pilotRef:
            # Tag the log lines of this object and of the DB objects with the pilot reference
            self.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.pilotAgentsDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.jobDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.tqDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
            self.jlDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef)
        else:
            self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper, pilotRef=pilotRef)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job matching the resource capacity

        :param dict resourceDescription: description of the resource asking for a job
        :param dict credDict: credentials of the requester
        :return: empty dict when no match is found, otherwise a dict with (at least)
                 the keys JDL, JobID, DN, Group and PilotInfoReportedFlag
        :raises RuntimeError: when the task queue or job DB queries fail, or when the
                 matched job is not in Waiting state
        """

        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict["MaxRAM"] = resourceDescription["MaxRAM"]
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict["NumberOfProcessors"] = resourceDescription["NumberOfProcessors"]
        # The generated "<N>GB" / "<N>Processors" tags are left out of the printout
        toPrintDict["Tag"] = [
            tag for tag in resourceDict.get("Tag", []) if not tag.endswith(("GB", "Processors"))
        ]
        if not toPrintDict["Tag"]:
            toPrintDict.pop("Tag")

        self.log.info("Resource description for matching", printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(resourceDict["Site"], resourceDict.get("GridCE"))
        result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

        if not result["OK"]:
            raise RuntimeError(result["Message"])
        result = result["Value"]
        if not result["matchFound"]:
            self.log.info("No match found")
            return {}

        jobID = result["jobId"]
        resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup", "Status"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")
        if resAtt["Value"]["Status"] != "Waiting":
            # The job is removed from the task queue before the error is raised
            self.log.error("Job matched by the TQ is not in Waiting state", str(jobID))
            result = self.tqDB.deleteJob(jobID)
            if not result["OK"]:
                raise RuntimeError(result["Message"])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result["OK"]:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict["JDL"] = result["Value"]
        resultDict["JobID"] = jobID

        matchTime = time.time() - startTime
        self.log.verbose("Match time", "[%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt["OK"]:
            resultDict.update(resOpt["Value"])
        resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict["Site"], jobID)

        pilotInfoReportedFlag = resourceDict.get("PilotInfoReportedFlag", False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict["DN"] = resAtt["Value"]["OwnerDN"]
        resultDict["Group"] = resAtt["Value"]["OwnerGroup"]
        resultDict["PilotInfoReportedFlag"] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """from resourceDescription to resourceDict (just various mods)

        :param dict resourceDescription: description of the resource asking for a job
        :param dict credDict: credentials of the requester
        :return: the resource dictionary used for the matching
        """
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict["JobType"] = "Test"

        self.log.verbose("Resource description")
        for key in resourceDict:
            self.log.debug("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary

        The input dictionary is not modified: a list-valued "Tag" is copied before
        the generated tags are added to it.

        :param resourceDescription: a ceDict coming from a JobAgent, for example.
        :return: updated dictionary of resource description parameters
        """

        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        if resourceDescription.get("Tag"):
            tags = resourceDescription["Tag"]
            if isinstance(tags, list):
                # Work on a copy: tags are appended below and the caller's
                # list must not be changed behind its back
                resourceDict["Tag"] = list(tags)
            else:
                # Otherwise expect a string such as '["a", "b"]' or 'a,b'
                resourceDict["Tag"] = list({tag.strip("\"' ") for tag in tags.strip("[]").split(",")})
        if "RequiredTag" in resourceDescription:
            requiredTags = resourceDescription["RequiredTag"]
            if isinstance(requiredTags, str):
                requiredTags = list({tag.strip("\"' ") for tag in requiredTags.strip("[]").split(",")})
            resourceDict["RequiredTag"] = requiredTags

        if "JobID" in resourceDescription:
            resourceDict["JobID"] = resourceDescription["JobID"]

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get("MaxRAM")
        if maxRAM:
            try:
                # Numeric strings are accepted as well as numbers; the value is
                # divided by 1000 to get the number used in the "<N>GB" tags
                maxRAM = int(float(maxRAM) / 1000)
            except (TypeError, ValueError, OverflowError):
                maxRAM = None
        nProcessors = resourceDescription.get("NumberOfProcessors")
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except (TypeError, ValueError):
                nProcessors = None
        for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]:
            # One tag per value from 2 up to the capacity; values above 1024 generate no tags
            if param and param <= 1024:
                paramTags = ["%d%s" % (par, key) for par in range(2, param + 1)]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        # Add 'MultiProcessor' to the list of tags
        if nProcessors and nProcessors > 1:
            resourceDict.setdefault("Tag", []).append("MultiProcessor")

        # Add 'WholeNode' to the list of tags
        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        # Remove duplicates (the resulting order is not defined)
        if "Tag" in resourceDict:
            resourceDict["Tag"] = list(set(resourceDict["Tag"]))
        if "RequiredTag" in resourceDict:
            resourceDict["RequiredTag"] = list(set(resourceDict["RequiredTag"]))

        for k in (
            "DIRACVersion",
            "ReleaseVersion",
            "ReleaseProject",
            "VirtualOrganization",
            "PilotReference",
            "PilotBenchmark",
            "PilotInfoReportedFlag",
        ):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here

        :param dict resourceDict: resource dictionary; its "Site" is stored as the job site
        :param jobID: ID of the matched job
        """
        attNames = ["Status", "MinorStatus", "ApplicationStatus", "Site"]
        attValues = ["Matched", "Assigned", "Unknown", resourceDict["Site"]]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result["OK"]:
            self.log.error(
                "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % (jobID, result["Message"])
            )
        else:
            self.log.verbose("Set job attributes for jobID", jobID)

        result = self.jlDB.addLoggingRecord(jobID, status=JobStatus.MATCHED, minorStatus="Assigned", source="Matcher")
        if not result["OK"]:
            self.log.error(
                "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % (jobID, result["Message"])
            )
        else:
            self.log.verbose("Added logging record for jobID", jobID)

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :param dict resourceDict: resource dictionary, must contain "Site"
        :return: True if the site is among the usable sites, False otherwise
        :raises RuntimeError: if "Site" is missing or the usable sites cannot be obtained
        """
        if "Site" not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict["Site"])
        if not result["OK"]:
            self.log.error("Internal error", "siteClient.getUsableSites: %s" % result["Message"])
            raise RuntimeError("Internal error")

        return resourceDict["Site"] in result["Value"]

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it

        Nothing is done when the resource has no (or an "Unknown") PilotReference.
        """
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            gridCE = resourceDict.get("GridCE", "Unknown")
            site = resourceDict.get("Site", "Unknown")
            benchmark = resourceDict.get("PilotBenchmark", 0.0)
            self.log.verbose(
                "Reporting pilot info",
                "for %s: gridCE=%s, site=%s, benchmark=%f" % (pilotReference, gridCE, site, benchmark),
            )

            result = self.pilotAgentsDB.setPilotStatus(
                pilotReference, status=PilotStatus.RUNNING, gridSite=site, destination=gridCE, benchmark=benchmark
            )
            if not result["OK"]:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result["Message"]),
                )

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information

        Errors are only logged. Nothing is done when the resource has no
        (or an "Unknown") PilotReference.
        """
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result["Message"]),
                )
            result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result["Message"]),
                )

    def _checkCredentials(self, resourceDict, credDict):
        """Check if we can get a job given the passed credentials

        :param dict resourceDict: resource dictionary; OwnerDN / OwnerGroup are set in place
        :param dict credDict: credentials; the keys "properties", "group" and "DN" are read
        :return: the (updated) resourceDict
        :raises RuntimeError: when the Registry lookups fail
        """
        if Properties.GENERIC_PILOT in credDict["properties"]:
            # You can only match groups in the same VO
            if credDict["group"] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get("VirtualOrganization", "")
            else:
                vo = Registry.getVOForGroup(credDict["group"])
            if "OwnerGroup" not in resourceDict:
                result = Registry.getGroupsForVO(vo)
                if result["OK"]:
                    resourceDict["OwnerGroup"] = result["Value"]
                else:
                    raise RuntimeError(result["Message"])
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict["properties"]:
                self.log.notice("Setting the resource DN to the credentials DN")
                resourceDict["OwnerDN"] = credDict["DN"]
            # If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            elif Properties.JOB_SHARING in credDict["properties"]:
                resourceDict["OwnerGroup"] = credDict["group"]
                self.log.notice("Setting the resource group to the credentials group")
                if "OwnerDN" in resourceDict and resourceDict["OwnerDN"] != credDict["DN"]:
                    ownerDN = resourceDict["OwnerDN"]
                    result = Registry.getGroupsForDN(resourceDict["OwnerDN"])
                    if not result["OK"]:
                        raise RuntimeError(result["Message"])
                    if credDict["group"] not in result["Value"]:
                        # DN is not in the same group! bad boy.
                        self.log.warn(
                            "You cannot request jobs from this DN, as it does not belong to your group!",
                            "(%s)" % ownerDN,
                        )
                        resourceDict["OwnerDN"] = credDict["DN"]
            # Nothing special, group and DN have to be the same
            else:
                resourceDict["OwnerDN"] = credDict["DN"]
                resourceDict["OwnerGroup"] = credDict["group"]

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version

        Nothing is checked when the "Pilot/CheckVersion" option is switched off.

        :param dict resourceDict: resource dictionary; ReleaseVersion is used when
                                  present, DIRACVersion otherwise
        :raises PilotVersionError: when the version (or the project, if requested)
                                   is missing or not an accepted one
        """
        if not self.opsHelper.getValue("Pilot/CheckVersion", True):
            return

        if "ReleaseVersion" in resourceDict:
            pilotVersion = resourceDict["ReleaseVersion"]
        elif "DIRACVersion" in resourceDict:
            pilotVersion = resourceDict["DIRACVersion"]
        else:
            raise PilotVersionError("Version check requested and not provided by Pilot")

        validVersions = [
            convertToPy3VersionNumber(newStyleVersion)
            for newStyleVersion in self.opsHelper.getValue("Pilot/Version", [])
        ]
        # An empty list of valid versions accepts any pilot version
        if validVersions and convertToPy3VersionNumber(pilotVersion) not in validVersions:
            raise PilotVersionError(
                "Pilot version does not match the production version: %s not in ( %s )"
                % (pilotVersion, ",".join(validVersions))
            )

        # Check project if requested
        validProject = self.opsHelper.getValue("Pilot/Project", "")
        if validProject:
            if "ReleaseProject" not in resourceDict:
                raise PilotVersionError(
                    "Version check requested but expected project %s not received" % validProject
                )
            if resourceDict["ReleaseProject"] != validProject:
                raise PilotVersionError(
                    "Version check requested but expected project %s != received %s"
                    % (validProject, resourceDict["ReleaseProject"])
                )
def checkJob( self, job, classAdJob ):
    """Run the scheduling checks for a job and route it accordingly.

    The checks run in this order: rescheduling delay, the job's own
    Site / BannedSites requirement, the 'ComputingAccess' site mask, input
    data, and finally staging. Depending on the outcome the job is put on
    hold (only its application status is updated), a staging request is set,
    or the job is sent to the task queue.

    :param job: job identifier
    :param classAdJob: job description; only getAttributeString is called on it here
    :returns: S_OK / S_ERROR structure
    """
    self.log.verbose( 'Job %s will be processed' % job )

    # Check if the job was recently rescheduled
    attrResult = self.jobDB.getJobAttributes( job, ['RescheduleCounter',
                                                    'RescheduleTime',
                                                    'ApplicationStatus'] )
    if not attrResult['OK']:
        self.log.error( attrResult['Message'] )
        return S_ERROR( 'Can not get job attributes from JobDB' )
    attributes = attrResult['Value']
    rescheduleCount = int( attributes['RescheduleCounter'] )
    if rescheduleCount != 0:
        lastReschedule = fromString( attributes['RescheduleTime'] )
        elapsed = toEpoch() - toEpoch( lastReschedule )
        requiredDelay = self.maxRescheduleDelay
        if rescheduleCount <= len( self.rescheduleDelaysList ):
            requiredDelay = self.rescheduleDelaysList[rescheduleCount - 1]
        if elapsed < requiredDelay:
            # Still inside the delay window: flag it once and leave the job alone
            if attributes['ApplicationStatus'].find( 'On Hold: after rescheduling' ) < 0:
                self.jobDB.setJobStatus( job,
                                         application = 'On Hold: after rescheduling #%d' % rescheduleCount )
            return S_OK()

    # First, get Site and BannedSites from the Job
    requirement = self.__getJobSiteRequirement( job, classAdJob )
    userBannedSites = requirement['BannedSites']
    userSites = requirement['Sites']
    if userSites:
        userSites = applySiteRequirements( userSites, [], userBannedSites )
        if not userSites:
            return S_ERROR( 'Impossible Site Requirement' )

    # Second, get the Active and Banned sites from the RSS
    siteStatus = SiteStatus()
    usableResult = siteStatus.getUsableSites( 'ComputingAccess' )
    unusableResult = siteStatus.getUnusableSites( 'ComputingAccess' )
    failedQueries = [ query for query in ( usableResult, unusableResult ) if not query['OK'] ]
    if failedQueries:
        for query in failedQueries:
            self.log.error( query['Message'] )
        return S_ERROR( 'Can not get Active and Banned Sites from JobDB' )
    usableSites = usableResult['Value']
    unusableSites = unusableResult['Value']

    if userSites and not applySiteRequirements( userSites, usableSites, unusableSites ):
        # Put on Hold only non-excluded job types
        jobType = classAdJob.getAttributeString( 'JobType' )
        if jobType not in self.excludedOnHoldJobTypes:
            holdMsg = 'On Hold: Requested site is Banned or not Active'
            self.log.info( holdMsg )
            self.jobDB.setJobStatus( job, application = holdMsg )
            return S_OK()

    # Third, check if there is input data
    dataResult = self.jobDB.getInputData( job )
    if not dataResult['OK']:
        self.log.warn( 'Failed to get input data from JobDB for %s' % job )
        self.log.error( dataResult['Message'] )
        return S_ERROR( 'Failed to get input data from JobDB' )
    if not dataResult['Value']:
        return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    # Empty entries do not count as an input data requirement
    inputData = [ lfn for lfn in dataResult['Value'] if lfn ]
    if not inputData:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose( 'Job %s has no input data requirement' % job )
        return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    self.log.verbose( 'Job %s has an input data requirement ' % job )

    # Fourth, check all optimizer information
    optResult = self.__checkOptimizerInfo( job )
    if not optResult['OK']:
        return optResult
    optInfo = optResult['Value']

    # Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info( 'Input Data Site Candidates: %s' % ', '.join( optSites ) )

    # Check that it is compatible with user requirements
    optSites = applySiteRequirements( optSites, userSites, userBannedSites )
    if not optSites:
        return S_ERROR( 'Impossible Site + InputData Requirement' )

    sites = applySiteRequirements( optSites, usableSites, unusableSites )
    if not sites:
        holdMsg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info( holdMsg )
        self.jobDB.setJobStatus( job, application = holdMsg )
        return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape
    # if more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging( job, sites, inputData, optInfo['SiteCandidates'] )
    if not checkStaging['OK']:
        return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR( 'No destination sites available' )

    if not checkStaging['Value']:
        # No staging required, can proceed to task queue agent and then waiting status
        self.log.verbose( 'Job %s does not require staging of input data' % job )
        # Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue( job, classAdJob, destinationSites, userBannedSites )

    # Single site candidate chosen and staging required
    self.log.verbose( 'Job %s requires staging of input data' % job )
    # Set all LFN to disk for the selected site
    stagingSite = destinationSites[0]
    candidate = optInfo['SiteCandidates'][stagingSite]
    candidate['disk'] = candidate['disk'] + candidate['tape']
    candidate['tape'] = 0
    optInfo['SiteCandidates'][stagingSite] = candidate
    self.log.verbose( 'Updating %s Optimizer Info for Job %s:' % ( self.dataAgentName, job ), optInfo )
    storeResult = self.setOptimizerJobInfo( job, self.dataAgentName, optInfo )
    if not storeResult['OK']:
        return storeResult

    # Site is selected for staging, report it
    self.log.verbose( 'Staging site candidate for job %s is %s' % ( job, stagingSite ) )

    stagingResult = self.__getStagingSites( stagingSite, destinationSites )
    stagingSites = stagingResult['Value'] if stagingResult['OK'] else [stagingSite]

    if len( stagingSites ) == 1:
        siteAttribute = stagingSite
    else:
        # Use the name of the site group when there is one, 'Multiple' otherwise
        siteAttribute = 'Multiple'
        groupResult = self.__getSiteGroup( stagingSites )
        if groupResult['OK']:
            groupName = groupResult['Value']
            if groupName:
                siteAttribute = groupName
    self.jobDB.setJobAttribute( job, 'Site', siteAttribute )

    stagerDict = self.__setStagingRequest( job, stagingSite, optInfo )
    if not stagerDict['OK']:
        return stagerDict
    self.__updateOtherSites( job, stagingSite, stagerDict['Value'], optInfo )
    return S_OK()
class MatcherHandler(RequestHandler):
    """Service handler that matches waiting jobs to resource descriptions."""

    # Operations helpers shared by all handler instances, keyed by (vo, setup)
    __opsCache = {}

    def initialize(self):
        """Set up the Operations helper, the Limiter and the SiteStatus client."""
        self.__opsHelper = self.__getOpsHelper()
        self.__limiter = Limiter(self.__opsHelper)
        self.__siteStatus = SiteStatus()

    def __getOpsHelper(self, setup=False, vo=False):
        """Return the cached Operations helper for the given (vo, setup) pair.

        :param setup: setup name; taken from the client setup when false
        :param vo: VO name; derived from the remote credentials' group when false
        :returns: Operations helper, created on first use and cached afterwards
        """
        if not setup:
            setup = self.srv_getClientSetup()
        if not vo:
            vo = Registry.getVOForGroup(self.getRemoteCredentials()['group'])
        cKey = (vo, setup)
        if cKey not in MatcherHandler.__opsCache:
            MatcherHandler.__opsCache[cKey] = Operations.Operations(vo=vo, setup=setup)
        return MatcherHandler.__opsCache[cKey]

    def __processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary.

        :param resourceDescription: either a JDL string or a dictionary
        :returns: S_OK( resourceDict ), or S_ERROR when the JDL string is illegal
        """
        resourceDict = {}
        if isinstance(resourceDescription, StringTypes):
            classAdAgent = ClassAd(resourceDescription)
            if not classAdAgent.isOK():
                return S_ERROR('Illegal Resource JDL')
            gLogger.verbose(classAdAgent.asJDL())

            for name in gTaskQueueDB.getSingleValueTQDefFields():
                if classAdAgent.lookupAttribute(name):
                    if name == 'CPUTime':
                        resourceDict[name] = classAdAgent.getAttributeInt(name)
                    else:
                        resourceDict[name] = classAdAgent.getAttributeString(name)

            for name in gTaskQueueDB.getMultiValueMatchFields():
                if classAdAgent.lookupAttribute(name):
                    if name == 'SubmitPool':
                        resourceDict[name] = classAdAgent.getListFromExpression(name)
                    else:
                        resourceDict[name] = classAdAgent.getAttributeString(name)

            # Check if a JobID is requested
            if classAdAgent.lookupAttribute('JobID'):
                resourceDict['JobID'] = classAdAgent.getAttributeInt('JobID')

            for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                      'VirtualOrganization'):
                if classAdAgent.lookupAttribute(k):
                    resourceDict[k] = classAdAgent.getAttributeString(k)

        else:
            # Dictionary description: copy the known fields over as they are.
            # The "in" operator replaces has_key and works on Python 2 and 3.
            for name in gTaskQueueDB.getSingleValueTQDefFields():
                if name in resourceDescription:
                    resourceDict[name] = resourceDescription[name]

            for name in gTaskQueueDB.getMultiValueMatchFields():
                if name in resourceDescription:
                    resourceDict[name] = resourceDescription[name]

            if 'JobID' in resourceDescription:
                resourceDict['JobID'] = resourceDescription['JobID']

            for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                      'VirtualOrganization', 'PilotReference',
                      'PilotInfoReportedFlag', 'PilotBenchmark', 'LHCbPlatform'):
                if k in resourceDescription:
                    resourceDict[k] = resourceDescription[k]

        return S_OK(resourceDict)

    def selectJob(self, resourceDescription):
        """Find the highest priority job matching the resource capacity.

        :param resourceDescription: JDL string or dictionary describing the resource
        :returns: S_OK( dict ) with 'JDL', 'JobID', 'DN', 'Group',
            'PilotInfoReportedFlag' and the job's optimizer parameters, or
            S_ERROR when validation fails or no job matches
        """
        startTime = time.time()

        # An illegal JDL must stop the matching here. Otherwise the error
        # structure would be used as the resource dictionary and a later,
        # unrelated error would be reported instead.
        result = self.__processResourceDescription(resourceDescription)
        if not result['OK']:
            return result
        resourceDict = result['Value']

        credDict = self.getRemoteCredentials()
        # Check credentials if not generic pilot
        if Properties.GENERIC_PILOT in credDict['properties']:
            # You can only match groups in the same VO
            vo = Registry.getVOForGroup(credDict['group'])
            result = Registry.getGroupsForVO(vo)
            # NOTE(review): a failed lookup leaves OwnerGroup untouched -
            # confirm that this is intended.
            if result['OK']:
                resourceDict['OwnerGroup'] = result['Value']
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict['properties']:
                gLogger.notice("Setting the resource DN to the credentials DN")
                resourceDict['OwnerDN'] = credDict['DN']
            # If it's a job sharing. The group has to be the same and just check
            # that the DN (if any) belongs to the same group
            elif Properties.JOB_SHARING in credDict['properties']:
                resourceDict['OwnerGroup'] = credDict['group']
                gLogger.notice("Setting the resource group to the credentials group")
                if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
                    ownerDN = resourceDict['OwnerDN']
                    result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                    if not result['OK'] or credDict['group'] not in result['Value']:
                        # DN is not in the same group: fall back to the caller's DN
                        gLogger.notice(
                            "You cannot request jobs from DN %s. It does not belong to your group!"
                            % ownerDN)
                        resourceDict['OwnerDN'] = credDict['DN']
            # Nothing special, group and DN have to be the same
            else:
                resourceDict['OwnerDN'] = credDict['DN']
                resourceDict['OwnerGroup'] = credDict['group']

        # Check the pilot DIRAC version
        if self.__opsHelper.getValue("Pilot/CheckVersion", True):
            if 'ReleaseVersion' in resourceDict:
                pilotVersion = resourceDict['ReleaseVersion']
            elif 'DIRACVersion' in resourceDict:
                pilotVersion = resourceDict['DIRACVersion']
            else:
                return S_ERROR('Version check requested and not provided by Pilot')

            validVersions = self.__opsHelper.getValue("Pilot/Version", [])
            if validVersions and pilotVersion not in validVersions:
                return S_ERROR(
                    'Pilot version does not match the production version %s not in ( %s )'
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.__opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if 'ReleaseProject' not in resourceDict:
                    return S_ERROR(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict['ReleaseProject'] != validProject:
                    return S_ERROR(
                        "Version check requested but expected project %s != received %s"
                        % (validProject, resourceDict['ReleaseProject']))

        # Update pilot information
        pilotInfoReported = False
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            if "PilotInfoReportedFlag" in resourceDict and not resourceDict['PilotInfoReportedFlag']:
                gridCE = resourceDict.get('GridCE', 'Unknown')
                site = resourceDict.get('Site', 'Unknown')
                benchmark = resourceDict.get('PilotBenchmark', 0.0)
                gLogger.verbose(
                    'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f'
                    % (pilotReference, gridCE, site, benchmark))
                result = gPilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
                if result['OK']:
                    pilotInfoReported = True

        # Check the site mask
        if 'Site' not in resourceDict:
            return S_ERROR('Missing Site Name in Resource JDL')

        # Get common site mask and check the agent site
        result = self.__siteStatus.getUsableSites('ComputingAccess')
        if not result['OK']:
            return S_ERROR('Internal error: can not get site mask')
        usableSites = result['Value']

        siteName = resourceDict['Site']
        if siteName not in usableSites:
            if 'GridCE' not in resourceDict:
                return S_ERROR('Site not in mask and GridCE not specified')
            # Even if the site is banned, if it defines a CE, it must be able to check it
            del resourceDict['Site']

        resourceDict['Setup'] = self.serviceInfoDict['clientSetup']

        gLogger.verbose("Resource description:")
        for key in resourceDict:
            gLogger.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        negativeCond = self.__limiter.getNegativeCondForSite(siteName)
        result = gTaskQueueDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

        if DEBUG:
            # Parenthesised form prints the same on Python 2 and is valid on Python 3
            print(result)

        if not result['OK']:
            return result
        result = result['Value']
        if not result['matchFound']:
            return S_ERROR('No match found')

        jobID = result['jobId']
        resAtt = gJobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            return S_ERROR('Could not retrieve job attributes')
        if not resAtt['Value']:
            return S_ERROR('No attributes returned for job')
        if resAtt['Value']['Status'] != 'Waiting':
            # The task queue handed out a job that is not waiting: drop it from the TQ
            gLogger.error('Job matched by the TQ is not in Waiting state', str(jobID))
            result = gTaskQueueDB.deleteJob(jobID)
            if not result['OK']:
                return result
            return S_ERROR("Job %s is not in Waiting state" % str(jobID))

        # NOTE(review): the outcome of the two bookkeeping calls below is not
        # checked, as in the original code - confirm this is deliberate.
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', siteName]
        gJobDB.setJobAttributes(jobID, attNames, attValues)
        gJobLoggingDB.addLoggingRecord(jobID,
                                       status='Matched',
                                       minor='Assigned',
                                       source='Matcher')

        result = gJobDB.getJobJDL(jobID)
        if not result['OK']:
            return S_ERROR('Failed to get the job JDL')

        resultDict = {}
        resultDict['JDL'] = result['Value']
        resultDict['JobID'] = jobID

        matchTime = time.time() - startTime
        gLogger.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = gJobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            for key, value in resOpt['Value'].items():
                resultDict[key] = value
        resAtt = gJobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            return S_ERROR('Could not retrieve job attributes')
        if not resAtt['Value']:
            return S_ERROR('No attributes returned for job')

        if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.__limiter.updateDelayCounters(siteName, jobID)

        # Report pilot-job association (results are not checked)
        if pilotReference:
            gPilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            gPilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = pilotInfoReported
        return S_OK(resultDict)

    ##############################################################################
    types_requestJob = [[StringType, DictType]]

    def export_requestJob(self, resourceDescription):
        """Serve the highest priority job matching the requesting agent's resources.

        :param resourceDescription: JDL string or dictionary describing the resource
        :returns: the result of selectJob
        """
        result = self.selectJob(resourceDescription)
        gMonitor.addMark("matchesDone")
        if result['OK']:
            gMonitor.addMark("matchesOK")
        return result

    ##############################################################################
    types_getActiveTaskQueues = []

    def export_getActiveTaskQueues(self):
        """Return all task queues."""
        return gTaskQueueDB.retrieveTaskQueues()

    ##############################################################################
    types_getMatchingTaskQueues = [DictType]

    def export_getMatchingTaskQueues(self, resourceDict):
        """Return the task queues that match the given resource dictionary.

        The limiter's negative condition is site specific when 'Site' is
        given as a string, and the general one otherwise.
        """
        if 'Site' in resourceDict and isinstance(resourceDict['Site'], StringTypes):
            negativeCond = self.__limiter.getNegativeCondForSite(resourceDict['Site'])
        else:
            negativeCond = self.__limiter.getNegativeCond()
        return gTaskQueueDB.retrieveTaskQueuesThatMatch(resourceDict,
                                                        negativeCond=negativeCond)

    ##############################################################################
    types_matchAndGetTaskQueue = [DictType]

    def export_matchAndGetTaskQueue(self, resourceDict):
        """Return matching task queues."""
        return gTaskQueueDB.matchAndGetTaskQueue(resourceDict)
def checkJob(self, job, classAdJob):
    """Control the checking of a job before it is queued.

    Steps, in order: honour the rescheduling delay, apply the job's Site and
    BannedSites requirement, apply the 'ComputingAccess' site mask, look for
    input data and, when there is some, resolve the staging needs.

    :param job: job identifier
    :param classAdJob: job description; only getAttributeString is called on it here
    :returns: S_OK / S_ERROR structure; S_OK() with no value when the job was
        put on hold or a staging request was set
    """
    self.log.verbose('Job %s will be processed' % job)

    # --- Was the job recently rescheduled?
    wanted = ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus']
    res = self.jobDB.getJobAttributes(job, wanted)
    if not res['OK']:
        self.log.error(res['Message'])
        return S_ERROR('Can not get job attributes from JobDB')
    jobAttrs = res['Value']
    nReschedules = int(jobAttrs['RescheduleCounter'])
    if nReschedules != 0:
        rescheduledAt = fromString(jobAttrs['RescheduleTime'])
        waited = toEpoch() - toEpoch(rescheduledAt)
        mustWait = self.maxRescheduleDelay
        if nReschedules <= len(self.rescheduleDelaysList):
            mustWait = self.rescheduleDelaysList[nReschedules - 1]
        if waited < mustWait:
            alreadyFlagged = jobAttrs['ApplicationStatus'].find(
                'On Hold: after rescheduling') != -1
            if not alreadyFlagged:
                self.jobDB.setJobStatus(
                    job,
                    application='On Hold: after rescheduling #%d' % nReschedules)
            return S_OK()

    # --- First, Site and BannedSites requested by the job itself
    res = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = res['BannedSites']
    userSites = res['Sites']
    if userSites:
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            return S_ERROR('Impossible Site Requirement')

    # --- Second, the Active and Banned sites from the RSS
    rss = SiteStatus()
    usableRes = rss.getUsableSites('ComputingAccess')
    unusableRes = rss.getUnusableSites('ComputingAccess')
    usableOK = usableRes['OK']
    unusableOK = unusableRes['OK']
    if not usableOK or not unusableOK:
        if not usableOK:
            self.log.error(usableRes['Message'])
        if not unusableOK:
            self.log.error(unusableRes['Message'])
        return S_ERROR('Can not get Active and Banned Sites from JobDB')
    usableSites = usableRes['Value']
    unusableSites = unusableRes['Value']

    if userSites:
        allowedSites = applySiteRequirements(userSites, usableSites, unusableSites)
        if not allowedSites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString('JobType')
            if jobType not in self.excludedOnHoldJobTypes:
                msg = 'On Hold: Requested site is Banned or not Active'
                self.log.info(msg)
                self.jobDB.setJobStatus(job, application=msg)
                return S_OK()

    # --- Third, is there any input data?
    res = self.jobDB.getInputData(job)
    if not res['OK']:
        self.log.warn('Failed to get input data from JobDB for %s' % job)
        self.log.error(res['Message'])
        return S_ERROR('Failed to get input data from JobDB')
    rawInputData = res['Value']
    if not rawInputData:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    # Only non-empty LFNs count
    inputData = [lfn for lfn in rawInputData if lfn]
    if len(inputData) == 0:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose('Job %s has no input data requirement' % job)
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    self.log.verbose('Job %s has an input data requirement ' % job)

    # --- Fourth, check all optimizer information
    res = self.__checkOptimizerInfo(job)
    if not res['OK']:
        return res
    optInfo = res['Value']
    siteCandidates = optInfo['SiteCandidates']

    # Compare site candidates with current mask
    optSites = siteCandidates.keys()
    self.log.info('Input Data Site Candidates: %s' % ', '.join(optSites))

    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        return S_ERROR('Impossible Site + InputData Requirement')

    sites = applySiteRequirements(optSites, usableSites, unusableSites)
    if not sites:
        msg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info(msg)
        self.jobDB.setJobStatus(job, application=msg)
        return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape
    # if more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                 optInfo['SiteCandidates'])
    if not checkStaging['OK']:
        return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR('No destination sites available')

    if checkStaging['Value']:
        # Single site candidate chosen and staging required
        self.log.verbose('Job %s requires staging of input data' % job)
        # Set all LFN to disk for the selected site
        stagingSite = destinationSites[0]
        siteInfo = optInfo['SiteCandidates'][stagingSite]
        siteInfo['disk'] = siteInfo['disk'] + siteInfo['tape']
        siteInfo['tape'] = 0
        optInfo['SiteCandidates'][stagingSite] = siteInfo
        self.log.verbose(
            'Updating %s Optimizer Info for Job %s:' % (self.dataAgentName, job),
            optInfo)
        res = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
        if not res['OK']:
            return res

        # Site is selected for staging, report it
        self.log.verbose('Staging site candidate for job %s is %s' % (job, stagingSite))

        res = self.__getStagingSites(stagingSite, destinationSites)
        if res['OK']:
            stagingSites = res['Value']
        else:
            stagingSites = [stagingSite]

        if len(stagingSites) != 1:
            # Several sites: report the site group name, or 'Multiple' without one
            res = self.__getSiteGroup(stagingSites)
            if not res['OK']:
                self.jobDB.setJobAttribute(job, 'Site', 'Multiple')
            else:
                groupName = res['Value']
                self.jobDB.setJobAttribute(
                    job, 'Site', groupName if groupName else 'Multiple')
        else:
            self.jobDB.setJobAttribute(job, 'Site', stagingSite)

        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict['OK']:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict['Value'], optInfo)
        return S_OK()

    # No staging required, can proceed to task queue agent and then waiting status
    self.log.verbose('Job %s does not require staging of input data' % job)
    # Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue(job, classAdJob, destinationSites,
                                     userBannedSites)