Ejemplo n.º 1
0
 def export_getSitesResources( self, siteNames ):
   
   resources = Resources.Resources()
   
   if siteNames is None:
     siteNames = Resources.getSites()
     if not siteNames[ 'OK' ]:
       return siteNames
     siteNames = siteNames[ 'Value' ]
   
   if isinstance( siteNames, str ):
     siteNames = [ siteNames ]
   
   sitesRes = {}
   
   for siteName in siteNames:
     
     res = {}         
     res[ 'ces' ] = resources.getEligibleResources( 'Computing', { 'Site': siteName } )
     ses          = resources.getEligibleStorageElements( { 'Site': siteName } )
     sesHosts = CSHelpers.getStorageElementsHosts( ses )
     if not sesHosts[ 'OK' ]:
       return sesHosts
     res[ 'ses' ] = list( set( sesHosts[ 'Value' ] ) )
         
     sitesRes[ siteName ] = res
   
   return S_OK( sitesRes )
Ejemplo n.º 2
0
  def doCommand( self ):
    """ 
    Returns running and runned jobs, querying the WMSHistory  
    for the last self.args[0] hours 
        
    :params:
      :attr:`sites`: list of sites (when not given, take every sites)

    :returns:
      
    """

    if not 'hours' in self.args:
      return S_ERROR( 'Number of hours not specified' )
    hours = self.args[ 'hours' ]

    sites = None
    if 'sites' in self.args:
      sites = self.args[ 'sites' ] 
    if sites is None:      
#FIXME: pointing to the CSHelper instead     
#      sources = self.rsClient.getSite( meta = {'columns': 'SiteName'} )
#      if not sources[ 'OK' ]:
#        return sources 
#      sources = [ si[0] for si in sources[ 'Value' ] ]
      sites = Resources.getSites()      
      if not sites[ 'OK' ]:
        return sites
      sites = sites[ 'Value' ]
    
    if not sites:
      return S_ERROR( 'Sites is empty' )   

    fromD = datetime.utcnow() - timedelta( hours = hours )
    toD   = datetime.utcnow()

    runJobs = self.rClient.getReport( 'WMSHistory', 'NumberOfJobs', fromD, toD, 
                                       {}, 'Site')
    if not runJobs[ 'OK' ]:
      return runJobs 
    runJobs    = runJobs[ 'Value' ]
    
    if not 'data' in runJobs:
      return S_ERROR( 'Missing data key' )
    if not 'granularity' in runJobs:
      return S_ERROR( 'Missing granularity key' )
    
    singlePlots = {}
    
    for site, value in runJobs[ 'data' ].items():
      if site in sites:
        plot                  = {}
        plot[ 'data' ]        = { site: value }
        plot[ 'granularity' ] = runJobs[ 'granularity' ]
        singlePlots[ site ]   = plot
    
    return S_OK( singlePlots )

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
Ejemplo n.º 3
0
  def doCommand( self ):
    """ 
    Returns failed jobs using the DIRAC accounting system for every site 
    for the last self.args[0] hours 
        
    :params:
      :attr:`sites`: list of sites (when not given, take every site)

    :returns:
      
    """

    if not 'hours' in self.args:
      return S_ERROR( 'Number of hours not specified' )
    hours = self.args[ 'hours' ]

    sites = None
    if 'sites' in self.args:
      sites = self.args[ 'sites' ] 
    if sites is None:      
#FIXME: pointing to the CSHelper instead     
#      sources = self.rsClient.getSite( meta = {'columns': 'SiteName'} )
#      if not sources[ 'OK' ]:
#        return sources 
#      sources = [ si[0] for si in sources[ 'Value' ] ]
      sites = Resources.getSites()      
      if not sites[ 'OK' ]:
        return sites
      sites = sites[ 'Value' ]
    
    if not sites:
      return S_ERROR( 'Sites is empty' )

    fromD = datetime.utcnow() - timedelta( hours = hours )
    toD   = datetime.utcnow()

    failedPilots = self.rClient.getReport( 'Pilot', 'NumberOfPilots', fromD, toD, 
                                            { 'GridStatus' : [ 'Aborted' ], 
                                             'Site'       : sites
                                            }, 'Site' )
    if not failedPilots[ 'OK' ]:
      return failedPilots 
    failedPilots = failedPilots[ 'Value' ]
       
    if not 'data' in failedPilots:
      return S_ERROR( 'Missing data key' )
    if not 'granularity' in failedPilots:
      return S_ERROR( 'Missing granularity key' ) 
    
    singlePlots = {}

    for site, value in failedPilots[ 'data' ].items():
      if site in sites:
        plot                  = {}
        plot[ 'data' ]        = { site: value }
        plot[ 'granularity' ] = failedPilots[ 'granularity' ] 
        singlePlots[ site ]   = plot
    
    return S_OK( singlePlots )
Ejemplo n.º 4
0
def getGOCSites(diracSites=None):

    # FIXME: THIS SHOULD GO INTO Resources HELPER

    if diracSites is None:
        diracSites = Resources.getSites()
        if not diracSites["OK"]:
            return diracSites
        diracSites = diracSites["Value"]

    gocSites = []

    for diracSite in diracSites:
        gocSite = getGOCSiteName(diracSite)
        if not gocSite["OK"]:
            continue
        gocSites.append(gocSite["Value"])

    return S_OK(list(set(gocSites)))
Ejemplo n.º 5
0
Archivo: Job.py Proyecto: ahaupt/DIRAC
  def setPlatform( self, platform ):
    """Developer function: sets the target platform, e.g. Linux_x86_64_glibc-2.5.
       This platform is in the form of what it is returned by the dirac-platform script
       (or dirac-architecture if your extension provides it)
    """
    kwargs = {'platform':platform}

    if not isinstance( platform, basestring ):
      return self._reportError( "Expected string for platform", **kwargs )

    if not platform.lower() == 'any':
      availablePlatforms = Resources.getDIRACPlatforms()
      if not availablePlatforms['OK']:
        return self._reportError( "Can't check for platform", **kwargs )
      if platform in availablePlatforms['Value']:
        self._addParameter( self.workflow, 'Platform', 'JDL', platform, 'Platform ( Operating System )' )
      else:
        return self._reportError( "Invalid platform", **kwargs )

    return S_OK()
Ejemplo n.º 6
0
Archivo: Job.py Proyecto: petricm/DIRAC
    def setPlatform(self, platform):
        """Developer function: sets the target platform, e.g. Linux_x86_64_glibc-2.5.
       This platform is in the form of what it is returned by the dirac-platform script
       (or dirac-architecture if your extension provides it)
    """
        kwargs = {"platform": platform}

        if not type(platform) == type(" "):
            return self._reportError("Expected string for platform", **kwargs)

        if not platform.lower() == "any":
            availablePlatforms = Resources.getDIRACPlatforms()
            if not availablePlatforms["OK"]:
                return self._reportError("Can't check for platform", **kwargs)
            if platform in availablePlatforms["Value"]:
                self._addParameter(self.workflow, "Platform", "JDL", platform, "Platform ( Operating System )")
            else:
                return self._reportError("Invalid platform", **kwargs)

        return S_OK()
Ejemplo n.º 7
0
  def doMaster( self ):
    '''
      Master method, which looks little bit spaguetti code, sorry !
      - It gets all Sites.
      - It gets all StorageElements
      
      As there is no bulk query, it compares with what we have on the database.
      It queries a portion of them.
    '''

    sites = Resources.getSites()
    if not sites[ 'OK' ]:
      return sites
    sites = sites[ 'Value' ]
  
    ses = self.resources.getEligibleStorageElements()
    if not ses[ 'OK' ]:
      return ses
    ses = ses[ 'Value' ]
      
    elementNames = sites + ses   

#    sourceQuery = self.rmClient.selectTransferCache( meta = { 'columns' : [ 'SourceName' ] } )
#    if not sourceQuery[ 'OK' ]:
#      return sourceQuery
#    sourceQuery = [ element[0] for element in sourceQuery[ 'Value' ] ]
#    
#    sourceElementsToQuery = list( set( elementNames ).difference( set( sourceQuery ) ) )
    gLogger.info( 'Processing %s' % ', '.join( elementNames ) )
 
    for metric in [ 'Quality', 'FailedTransfers' ]:
      for direction in [ 'Source', 'Destination' ]: 
        # 2 hours of window
        result = self.doNew( ( 2, elementNames, direction, metric )  ) 
        if not result[ 'OK' ]:
          self.metrics[ 'failed' ].append( result )
       
    return S_OK( self.metrics )
  
################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
Ejemplo n.º 8
0
        from DIRAC.FrameworkSystem.Client.BundleDeliveryClient import BundleDeliveryClient
        bdc = BundleDeliveryClient()
        result = bdc.syncCAs()
        if result['OK']:
            result = bdc.syncCRLs()
    except:
        DIRAC.gLogger.exception('Could not import BundleDeliveryClient')
        pass
    if not skipCAChecks:
        Script.localCfg.deleteOption('/DIRAC/Security/SkipCAChecks')

if ceName or siteName:
    # This is used in the pilot context, we should have a proxy and access to CS
    Script.enableCS()

    resources = Resources.Resources(vo=vo)

    if not siteName:
        if ceName:
            result = resources.getSiteForResource('Computing', ceName)
            if result['OK']:
                site = result['Value']
                result = resources.getSiteFullName(site)
                if result['OK']:
                    siteName = result['Value']

    if siteName:
        DIRAC.gLogger.notice('Setting /LocalSite/Site = %s' % siteName)
        Script.localCfg.addDefaultEntry('/LocalSite/Site', siteName)
        DIRAC.__siteName = False
        if ceName:
Ejemplo n.º 9
0
    def createVMs(self):
        """Go through defined computing elements and submit jobs if necessary"""

        vmTypeList = list(self.vmTypeDict.keys())

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999}
        if self.vo:
            tqDict["VO"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites
        tags = []
        for vmType in vmTypeList:
            if "Tag" in self.vmTypeDict[vmType]["ParametersDict"]:
                tags += self.vmTypeDict[vmType]["ParametersDict"]["Tag"]
        tqDict["Tag"] = list(set(tags))

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        matcherClient = MatcherClient()
        result = matcherClient.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        if not result["Value"]:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result["Value"]:
            if "Sites" in result["Value"][tqID]:
                for site in result["Value"][tqID]["Sites"]:
                    if site.lower() != "any":
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result["Value"][tqID]:
                if "Sites" in result["Value"][tqID]:
                    for site in result["Value"][tqID]["Sites"]:
                        if site.lower() != "any":
                            testSites.add(site)
            totalWaitingJobs += result["Value"][tqID]["Jobs"]

        tqIDList = list(result["Value"].keys())

        result = virtualMachineDB.getInstanceCounters("Status", {})
        totalVMs = 0
        if result["OK"]:
            for status in result["Value"]:
                if status in ["New", "Submitted", "Running"]:
                    totalVMs += result["Value"][status]
        self.log.info("Total %d jobs in %d task queues with %d VMs" %
                      (totalWaitingJobs, len(tqIDList), totalVMs))

        # Check if the site is allowed in the mask
        result = self.siteClient.getUsableSites()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result.get("Value", [])

        vmTypeList = list(self.vmTypeDict.keys())
        random.shuffle(vmTypeList)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for vmType in vmTypeList:
            ce = self.vmTypeDict[vmType]["CE"]
            ceName = self.vmTypeDict[vmType]["CEName"]
            vmTypeName = self.vmTypeDict[vmType]["VMType"]
            siteName = self.vmTypeDict[vmType]["Site"]
            platform = self.vmTypeDict[vmType]["Platform"]
            vmTypeTags = self.vmTypeDict[vmType]["ParametersDict"].get(
                "Tag", [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.vmTypeDict[vmType]["MaxInstances"])
            processorTags = []

            # vms support WholeNode naturally
            processorTags.append("WholeNode")

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (vmTypeName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" %
                                 (vmTypeName, siteName))
                continue

            if "CPUTime" in self.vmTypeDict[vmType]["ParametersDict"]:
                vmTypeCPUTime = int(
                    self.vmTypeDict[vmType]["ParametersDict"]["CPUTime"])
            else:
                self.log.warn(
                    "CPU time limit is not specified for queue %s, skipping..."
                    % vmType)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict["JobType"] = "Test"
            if self.vo:
                ceDict["VO"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            ceDict["Tag"] = list(set(processorTags + vmTypeTags))

            # Get the number of eligible jobs for the target site/queue

            result = matcherClient.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error(
                    "Could not retrieve TaskQueues from TaskQueueDB",
                    result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.verbose("No matching TQs found for %s" % vmType)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            tqIDList = list(taskQueueDict.keys())
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            self.log.verbose(
                "%d job(s) from %d task queue(s) are eligible for %s queue" %
                (totalTQJobs, len(tqIDList), vmType))

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters(
                "Status", {"Endpoint": endpoint})
            if result["OK"]:
                for status in result["Value"]:
                    if status in ["New", "Submitted"]:
                        totalWaitingVMs += result["Value"][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" %
                                 totalWaitingVMs)

            self.log.verbose(
                "%d VMs for the total of %d eligible jobs for %s" %
                (totalWaitingVMs, totalTQJobs, vmType))

            # Get proxy to be used to connect to the cloud endpoint
            authType = ce.parameters.get("Auth")
            if authType and authType.lower() in ["x509", "voms"]:
                self.log.verbose("Getting cloud proxy for %s/%s" %
                                 (siteName, ceName))
                result = getProxyFileForCloud(ce)
                if not result["OK"]:
                    continue
                ce.setProxy(result["Value"])

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug("%s: No slots available" % vmType)
                continue

            vmsToSubmit = max(0, min(totalSlots,
                                     totalTQJobs - totalWaitingVMs))
            self.log.info("%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d" %
                          (vmType, totalSlots, totalTQJobs, totalWaitingVMs,
                           vmsToSubmit))

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                continue

            self.log.info("Going to submit %d VMs to %s queue" %
                          (vmsToSubmit, vmType))
            result = ce.createInstances(vmsToSubmit)

            # result = S_OK()
            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % vmType,
                               result["Message"])
                self.failedVMTypes.setdefault(vmType, 0)
                self.failedVMTypes[vmType] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result["Value"]
            totalSubmittedPilots += len(vmDict)
            self.log.info("Submitted %d VMs to %s@%s" %
                          (len(vmDict), vmTypeName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]["InstanceID"]
                endpoint = "%s::%s" % (self.vmTypeDict[vmType]["Site"], ceName)
                result = virtualMachineDB.insertInstance(
                    uuID, vmTypeName, diracUUID, endpoint, self.vo)
                if not result["OK"]:
                    continue
                pRef = "vm://" + ceName + "/" + diracUUID + ":00"
                pilotList.append(pRef)

            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.0
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]["Priority"]
                tqPriorityList.append((tq, sumPriority))
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if tqID not in tqDict:
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(
                    pilotList, tqID, "", "", self.localhost, "Cloud",
                    stampDict)
                if not result["OK"]:
                    self.log.error(
                        "Failed to insert pilots into the PilotAgentsDB: %s" %
                        result["Message"])

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Ejemplo n.º 10
0
    def getEndpoints(self, resourceDict):
        """Get the list of relevant CEs and their descriptions"""

        self.vmTypeDict = {}
        ceFactory = EndpointFactory()

        result = getPilotBootstrapParameters(vo=self.vo,
                                             runningPod=self.runningPod)
        if not result["OK"]:
            return result
        opParameters = result["Value"]

        for site in resourceDict:
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                ceTags = ceDict.get("Tag", [])
                if isinstance(ceTags, str):
                    ceTags = fromChar(ceTags)
                ceMaxRAM = ceDict.get("MaxRAM", None)
                qDict = ceDict.pop("VMTypes")
                for vmType in qDict:
                    vmTypeName = "%s_%s" % (ce, vmType)
                    self.vmTypeDict[vmTypeName] = {}
                    self.vmTypeDict[vmTypeName]["ParametersDict"] = qDict[
                        vmType]
                    self.vmTypeDict[vmTypeName]["ParametersDict"][
                        "VMType"] = vmType
                    self.vmTypeDict[vmTypeName]["ParametersDict"][
                        "Site"] = site
                    self.vmTypeDict[vmTypeName]["ParametersDict"][
                        "Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown")
                    self.vmTypeDict[vmTypeName]["ParametersDict"][
                        "CPUTime"] = 99999999

                    vmTypeTags = self.vmTypeDict[vmTypeName][
                        "ParametersDict"].get("Tag")
                    if vmTypeTags and isinstance(vmTypeTags, str):
                        vmTypeTags = fromChar(vmTypeTags)
                        self.vmTypeDict[vmTypeName]["ParametersDict"][
                            "Tag"] = vmTypeTags
                    if ceTags:
                        if vmTypeTags:
                            allTags = list(set(ceTags + vmTypeTags))
                            self.vmTypeDict[vmTypeName]["ParametersDict"][
                                "Tag"] = allTags
                        else:
                            self.vmTypeDict[vmTypeName]["ParametersDict"][
                                "Tag"] = ceTags

                    maxRAM = self.vmTypeDict[vmTypeName]["ParametersDict"].get(
                        "MaxRAM")
                    maxRAM = ceMaxRAM if not maxRAM else maxRAM
                    if maxRAM:
                        self.vmTypeDict[vmTypeName]["ParametersDict"][
                            "MaxRAM"] = maxRAM

                    ceWholeNode = ceDict.get("WholeNode", "true")
                    wholeNode = self.vmTypeDict[vmTypeName][
                        "ParametersDict"].get("WholeNode", ceWholeNode)
                    if wholeNode.lower() in ("yes", "true"):
                        self.vmTypeDict[vmTypeName][
                            "ParametersDict"].setdefault("Tag", [])
                        self.vmTypeDict[vmTypeName]["ParametersDict"][
                            "Tag"].append("WholeNode")

                    platform = ""
                    if "Platform" in self.vmTypeDict[vmTypeName][
                            "ParametersDict"]:
                        platform = self.vmTypeDict[vmTypeName][
                            "ParametersDict"]["Platform"]
                    elif "Platform" in ceDict:
                        platform = ceDict["Platform"]
                    if platform and platform not in self.platforms:
                        self.platforms.append(platform)

                    if "Platform" not in self.vmTypeDict[vmTypeName][
                            "ParametersDict"] and platform:
                        result = Resources.getDIRACPlatform(platform)
                        if result["OK"]:
                            self.vmTypeDict[vmTypeName]["ParametersDict"][
                                "Platform"] = result["Value"][0]

                    ceVMTypeDict = dict(ceDict)
                    ceVMTypeDict["CEName"] = ce
                    ceVMTypeDict["VO"] = self.vo
                    ceVMTypeDict["VMType"] = vmType
                    ceVMTypeDict["RunningPod"] = self.runningPod
                    ceVMTypeDict["CSServers"] = gConfig.getValue(
                        "/DIRAC/Configuration/Servers", [])
                    ceVMTypeDict.update(
                        self.vmTypeDict[vmTypeName]["ParametersDict"])

                    # Allow a resource-specifc CAPath to be set (as some clouds have their own CAs)
                    # Otherwise fall back to the system-wide default(s)
                    if "CAPath" not in ceVMTypeDict:
                        ceVMTypeDict["CAPath"] = gConfig.getValue(
                            "/DIRAC/Security/CAPath",
                            "/opt/dirac/etc/grid-security/certificates/cas.pem"
                        )

                    # Generate the CE object for the vmType or pick the already existing one
                    # if the vmType definition did not change
                    vmTypeHash = self.__generateVMTypeHash(ceVMTypeDict)
                    if vmTypeName in self.vmTypeCECache and self.vmTypeCECache[
                            vmTypeName]["Hash"] == vmTypeHash:
                        vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"]
                    else:
                        result = ceFactory.getCEObject(parameters=ceVMTypeDict)
                        if not result["OK"]:
                            return result
                        self.vmTypeCECache.setdefault(vmTypeName, {})
                        self.vmTypeCECache[vmTypeName]["Hash"] = vmTypeHash
                        self.vmTypeCECache[vmTypeName]["CE"] = result["Value"]
                        vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"]
                        vmTypeCE.setBootstrapParameters(opParameters)

                    self.vmTypeDict[vmTypeName]["CE"] = vmTypeCE
                    self.vmTypeDict[vmTypeName]["CEName"] = ce
                    self.vmTypeDict[vmTypeName]["CEType"] = ceDict["CEType"]
                    self.vmTypeDict[vmTypeName]["Site"] = site
                    self.vmTypeDict[vmTypeName]["VMType"] = vmType
                    self.vmTypeDict[vmTypeName]["Platform"] = platform
                    self.vmTypeDict[vmTypeName]["MaxInstances"] = ceDict[
                        "MaxInstances"]
                    if not self.vmTypeDict[vmTypeName]["CE"].isValid():
                        self.log.error(
                            "Failed to instantiate CloudEndpoint for %s" %
                            vmTypeName)
                        continue

                    if site not in self.sites:
                        self.sites.append(site)

        return S_OK()
Ejemplo n.º 11
0
    def doCommand(self):
        """ 
    Returns running and runned jobs, querying the WMSHistory  
    for the last self.args[0] hours 
        
    :params:
      :attr:`sites`: list of sites (when not given, take every sites)

    :returns:
      
    """

        if not 'hours' in self.args:
            return S_ERROR('Number of hours not specified')
        hours = self.args['hours']

        sites = None
        if 'sites' in self.args:
            sites = self.args['sites']
        if sites is None:
            #FIXME: pointing to the CSHelper instead
            #      sources = self.rsClient.getSite( meta = {'columns': 'SiteName'} )
            #      if not sources[ 'OK' ]:
            #        return sources
            #      sources = [ si[0] for si in sources[ 'Value' ] ]
            sites = Resources.getSites()
            if not sites['OK']:
                return sites
            sites = sites['Value']

        if not sites:
            return S_ERROR('Sites is empty')

        fromD = datetime.utcnow() - timedelta(hours=hours)
        toD = datetime.utcnow()

        runJobs = self.rClient.getReport('WMSHistory', 'NumberOfJobs', fromD,
                                         toD, {}, 'Site')
        if not runJobs['OK']:
            return runJobs
        runJobs = runJobs['Value']

        if not 'data' in runJobs:
            return S_ERROR('Missing data key')
        if not 'granularity' in runJobs:
            return S_ERROR('Missing granularity key')

        singlePlots = {}

        for site, value in runJobs['data'].items():
            if site in sites:
                plot = {}
                plot['data'] = {site: value}
                plot['granularity'] = runJobs['granularity']
                singlePlots[site] = plot

        return S_OK(singlePlots)


################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
Ejemplo n.º 12
0
    def beginExecution(self):

        self.gridEnv = self.am_getOption("GridEnv", getGridEnv())
        # The SiteDirector is for a particular user community
        self.vo = self.am_getOption("Community", "")
        if not self.vo:
            self.vo = CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", "")
        # self.voGroups contain all the eligible user groups for pilots submutted by this SiteDirector
        self.voGroups = []

        # Choose the group for which pilots will be submitted. This is a hack until
        # we will be able to match pilots to VOs.
        if not self.group:
            if self.vo:
                result = Registry.getGroupsForVO(self.vo)
                if not result["OK"]:
                    return result
                for group in result["Value"]:
                    if "NormalUser" in Registry.getPropertiesForGroup(group):
                        self.voGroups.append(group)
        else:
            self.voGroups = [self.group]

        result = findGenericPilotCredentials(vo=self.vo)
        if not result["OK"]:
            return result
        self.pilotDN, self.pilotGroup = result["Value"]
        self.pilotDN = self.am_getOption("PilotDN", self.pilotDN)
        self.pilotGroup = self.am_getOption("PilotGroup", self.pilotGroup)

        self.platforms = []
        self.sites = []
        self.defaultSubmitPools = ""
        if self.group:
            self.defaultSubmitPools = Registry.getGroupOption(self.group, "SubmitPools", "")
        elif self.vo:
            self.defaultSubmitPools = Registry.getVOOption(self.vo, "SubmitPools", "")

        self.pilot = self.am_getOption("PilotScript", DIRAC_PILOT)
        self.install = DIRAC_INSTALL
        self.workingDirectory = self.am_getOption("WorkDirectory")
        self.maxQueueLength = self.am_getOption("MaxQueueLength", 86400 * 3)
        self.pilotLogLevel = self.am_getOption("PilotLogLevel", "INFO")
        self.maxJobsInFillMode = self.am_getOption("MaxJobsInFillMode", self.maxJobsInFillMode)
        self.maxPilotsToSubmit = self.am_getOption("MaxPilotsToSubmit", self.maxPilotsToSubmit)
        self.pilotWaitingFlag = self.am_getOption("PilotWaitingFlag", True)
        self.pilotWaitingTime = self.am_getOption("MaxPilotWaitingTime", 7200)

        # Flags
        self.updateStatus = self.am_getOption("UpdatePilotStatus", True)
        self.getOutput = self.am_getOption("GetPilotOutput", True)
        self.sendAccounting = self.am_getOption("SendPilotAccounting", True)

        # Get the site description dictionary
        siteNames = None
        if not self.am_getOption("Site", "Any").lower() == "any":
            siteNames = self.am_getOption("Site", [])
        ceTypes = None
        if not self.am_getOption("CETypes", "Any").lower() == "any":
            ceTypes = self.am_getOption("CETypes", [])
        ces = None
        if not self.am_getOption("CEs", "Any").lower() == "any":
            ces = self.am_getOption("CEs", [])
        result = Resources.getQueues(
            community=self.vo, siteList=siteNames, ceList=ces, ceTypeList=ceTypes, mode="Direct"
        )
        if not result["OK"]:
            return result
        resourceDict = result["Value"]
        result = self.getQueues(resourceDict)
        if not result["OK"]:
            return result

        # if not siteNames:
        #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
        #  if siteName == 'Unknown':
        #    return S_OK( 'No site specified for the SiteDirector' )
        #  else:
        #    siteNames = [siteName]
        # self.siteNames = siteNames

        if self.updateStatus:
            self.log.always("Pilot status update requested")
        if self.getOutput:
            self.log.always("Pilot output retrieval requested")
        if self.sendAccounting:
            self.log.always("Pilot accounting sending requested")

        self.log.always("Sites:", siteNames)
        self.log.always("CETypes:", ceTypes)
        self.log.always("CEs:", ces)
        self.log.always("PilotDN:", self.pilotDN)
        self.log.always("PilotGroup:", self.pilotGroup)
        self.log.always("MaxPilotsToSubmit:", self.maxPilotsToSubmit)
        self.log.always("MaxJobsInFillMode:", self.maxJobsInFillMode)

        self.localhost = socket.getfqdn()
        self.proxy = ""

        if self.queueDict:
            self.log.always("Agent will serve queues:")
            for queue in self.queueDict:
                self.log.always(
                    "Site: %s, CE: %s, Queue: %s"
                    % (self.queueDict[queue]["Site"], self.queueDict[queue]["CEName"], queue)
                )

        return S_OK()
Ejemplo n.º 13
0
def main():
    global fullMatch
    global sites
    Script.registerSwitch("F", "full-match", "Check all the matching criteria",
                          setFullMatch)
    Script.registerSwitch(
        "S:", "site=", "Check matching for these sites (comma separated list)",
        setSites)
    Script.registerArgument("job_JDL: file with job JDL description")
    _, args = Script.parseCommandLine(ignoreErrors=True)

    from DIRAC.Core.Security.ProxyInfo import getVOfromProxyGroup
    from DIRAC.ConfigurationSystem.Client.Helpers import Resources
    from DIRAC.Core.Utilities.PrettyPrint import printTable
    from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
    from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result["OK"]:
        gLogger.error("No proxy found, please login")
        DIRACExit(-1)
    voName = result["Value"]

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues["OK"]:
        gLogger.error("Failed to get CE information")
        DIRACExit(-1)
    siteDict = resultQueues["Value"]
    result = getQueuesResolved(siteDict, {}, checkPlatform=True)
    if not resultQueues["OK"]:
        gLogger.error("Failed to get CE information")
        DIRACExit(-1)
    queueDict = result["Value"]

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask["OK"]:
        gLogger.error("Failed to get Site mask information")
        DIRACExit(-1)
    siteMaskList = resultMask.get("Value", [])

    rssClient = ResourceStatus()

    fields = ("Site", "CE", "Queue", "Status", "Match", "Reason")
    records = []

    for queue, queueInfo in queueDict.items():
        site = queueInfo["Site"]
        ce = queueInfo["CEName"]
        siteStatus = "Active" if site in siteMaskList else "InActive"
        ceStatus = siteStatus
        if rssClient.rssFlag:
            result = rssClient.getElementStatus(ce, "ComputingElement")
            if result["OK"]:
                ceStatus = result["Value"][ce]["all"]

        result = matchQueue(jdl,
                            queueInfo["ParametersDict"],
                            fullMatch=fullMatch)
        if not result["OK"]:
            gLogger.error("Failed in getting match data", result["Message"])
            DIRACExit(-1)
        status = "Active" if siteStatus == "Active" and ceStatus == "Active" else "Inactive"
        if result["Value"]["Match"]:
            records.append(
                (site, ce, queueInfo["QueueName"], status, "Yes", ""))
        else:
            records.append((site, ce, queueInfo["QueueName"], status, "No",
                            result["Value"]["Reason"]))

    gLogger.notice(
        printTable(fields,
                   records,
                   sortField="Site",
                   columnSeparator="  ",
                   printOut=False))
Ejemplo n.º 14
0
  def getQueues( self, resourceDict ):
    """ Get the list of relevant CEs and their descriptions
    """

    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
      result = self._resources.getSiteFullName( site )
      if not result['OK']:
        continue
      siteFullName = result['Value']
      for ce in resourceDict[site]:
        ceDict = resourceDict[site][ce]
        qDict = ceDict.pop( 'Queues' )
        for queue in qDict:
          queueName = '%s_%s' % ( ce, queue )
          self.queueDict[queueName] = {}
          self.queueDict[queueName]['ParametersDict'] = qDict[queue]
          self.queueDict[queueName]['ParametersDict']['Queue'] = queue
          self.queueDict[queueName]['ParametersDict']['Site'] = siteFullName
          self.queueDict[queueName]['ParametersDict']['GridEnv'] = self.gridEnv
          self.queueDict[queueName]['ParametersDict']['Setup'] = gConfig.getValue( '/DIRAC/Setup', 'unknown' )
          # Evaluate the CPU limit of the queue according to the Glue convention
          # To Do: should be a utility
          if "maxCPUTime" in self.queueDict[queueName]['ParametersDict'] and \
             "SI00" in self.queueDict[queueName]['ParametersDict']:
            maxCPUTime = float( self.queueDict[queueName]['ParametersDict']['maxCPUTime'] )
            # For some sites there are crazy values in the CS
            maxCPUTime = max( maxCPUTime, 0 )
            maxCPUTime = min( maxCPUTime, 86400 * 12.5 )
            si00 = float( self.queueDict[queueName]['ParametersDict']['SI00'] )
            queueCPUTime = 60. / 250. * maxCPUTime * si00
            self.queueDict[queueName]['ParametersDict']['CPUTime'] = int( queueCPUTime )
          qwDir = os.path.join( self.workingDirectory, queue )
          if not os.path.exists( qwDir ):
            os.makedirs( qwDir )
          self.queueDict[queueName]['ParametersDict']['WorkingDirectory'] = qwDir

          platform = ''
          if "Platform" in self.queueDict[queueName]['ParametersDict']:
            platform = self.queueDict[queueName]['ParametersDict']['Platform']
          elif "Platform" in ceDict:
            platform = ceDict['Platform']
          elif "OS" in ceDict:
            architecture = ceDict.get( 'architecture', 'x86_64' )
            OS = ceDict['OS']
            platform = '_'.join( [architecture, OS] )
          if platform and not platform in self.platforms:
            self.platforms.append( platform )

          if not "Platform" in self.queueDict[queueName]['ParametersDict'] and platform:
            result = Resources.getDIRACPlatform( platform )
            if result['OK']:
              self.queueDict[queueName]['ParametersDict']['Platform'] = result['Value']

          ceQueueDict = dict( ceDict )
          ceQueueDict.update( self.queueDict[queueName]['ParametersDict'] )
          result = ceFactory.getCE( ceName = ce,
                                    ceType = ceDict['CEType'],
                                    ceParametersDict = ceQueueDict )
          if not result['OK']:
            return result
          self.queueDict[queueName]['CE'] = result['Value']
          self.queueDict[queueName]['CEName'] = ce
          self.queueDict[queueName]['CEType'] = ceDict['CEType']
          self.queueDict[queueName]['Site'] = siteFullName
          self.queueDict[queueName]['QueueName'] = queue
          self.queueDict[queueName]['Platform'] = platform
          result = self.queueDict[queueName]['CE'].isValid()
          if not result['OK']:
            self.log.fatal( result['Message'] )
            return result
          if 'BundleProxy' in self.queueDict[queueName]['ParametersDict']:
            self.queueDict[queueName]['BundleProxy'] = True
          elif 'BundleProxy' in ceDict:
            self.queueDict[queueName]['BundleProxy'] = True

          if siteFullName not in self.sites:
            self.sites.append( siteFullName )

    return S_OK()
Ejemplo n.º 15
0
from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

if __name__ == '__main__':
    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result['OK']:
        gLogger.error('No proxy found, please login')
        DIRACExit(-1)
    voName = result['Value']

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    siteDict = resultQueues['Value']
    result = getQueuesResolved(siteDict)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    queueDict = result['Value']

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask['OK']:
        gLogger.error('Failed to get Site mask information')
        DIRACExit(-1)
Ejemplo n.º 16
0
  def getQueues( self, resourceDict ):
    """ Get the list of relevant CEs and their descriptions
    """

    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
      for ce in resourceDict[site]:
        ceDict = resourceDict[site][ce]
        ceTags = ceDict.get( 'Tag', [] )
        pilotRunDirectory = ceDict.get( 'PilotRunDirectory', '' )
        if isinstance( ceTags, basestring ):
          ceTags = fromChar( ceTags )
        ceMaxRAM = ceDict.get( 'MaxRAM', None )
        qDict = ceDict.pop( 'Queues' )
        for queue in qDict:
          queueName = '%s_%s' % ( ce, queue )
          self.queueDict[queueName] = {}
          self.queueDict[queueName]['ParametersDict'] = qDict[queue]
          self.queueDict[queueName]['ParametersDict']['Queue'] = queue
          self.queueDict[queueName]['ParametersDict']['Site'] = site
          self.queueDict[queueName]['ParametersDict']['GridEnv'] = self.gridEnv
          self.queueDict[queueName]['ParametersDict']['Setup'] = gConfig.getValue( '/DIRAC/Setup', 'unknown' )
          # Evaluate the CPU limit of the queue according to the Glue convention
          # To Do: should be a utility
          if "maxCPUTime" in self.queueDict[queueName]['ParametersDict'] and \
             "SI00" in self.queueDict[queueName]['ParametersDict']:
            maxCPUTime = float( self.queueDict[queueName]['ParametersDict']['maxCPUTime'] )
            # For some sites there are crazy values in the CS
            maxCPUTime = max( maxCPUTime, 0 )
            maxCPUTime = min( maxCPUTime, 86400 * 12.5 )
            si00 = float( self.queueDict[queueName]['ParametersDict']['SI00'] )
            queueCPUTime = 60. / 250. * maxCPUTime * si00
            self.queueDict[queueName]['ParametersDict']['CPUTime'] = int( queueCPUTime )

          queueTags = self.queueDict[queueName]['ParametersDict'].get( 'Tag' )
          if queueTags and isinstance( queueTags, basestring ):
            queueTags = fromChar( queueTags )
            self.queueDict[queueName]['ParametersDict']['Tag'] = queueTags
          if ceTags:
            if queueTags:
              allTags = list( set( ceTags + queueTags ) )
              self.queueDict[queueName]['ParametersDict']['Tag'] = allTags
            else:
              self.queueDict[queueName]['ParametersDict']['Tag'] = ceTags

          maxRAM = self.queueDict[queueName]['ParametersDict'].get( 'MaxRAM' )
          maxRAM = ceMaxRAM if not maxRAM else maxRAM
          if maxRAM:
            self.queueDict[queueName]['ParametersDict']['MaxRAM'] = maxRAM
          if pilotRunDirectory:
            self.queueDict[queueName]['ParametersDict']['JobExecDir'] = pilotRunDirectory
          qwDir = os.path.join( self.workingDirectory, queue )
          mkDir(qwDir)
          self.queueDict[queueName]['ParametersDict']['WorkingDirectory'] = qwDir
          platform = ''
          if "Platform" in self.queueDict[queueName]['ParametersDict']:
            platform = self.queueDict[queueName]['ParametersDict']['Platform']
          elif "Platform" in ceDict:
            platform = ceDict['Platform']
          elif "OS" in ceDict:
            architecture = ceDict.get( 'architecture', 'x86_64' )
            OS = ceDict['OS']
            platform = '_'.join( [architecture, OS] )
          if platform and not platform in self.platforms:
            self.platforms.append( platform )

          if not "Platform" in self.queueDict[queueName]['ParametersDict'] and platform:
            result = Resources.getDIRACPlatform( platform )
            if result['OK']:
              self.queueDict[queueName]['ParametersDict']['Platform'] = result['Value'][0]

          ceQueueDict = dict( ceDict )
          ceQueueDict.update( self.queueDict[queueName]['ParametersDict'] )

          # Generate the CE object for the queue or pick the already existing one
          # if the queue definition did not change
          queueHash = self.__generateQueueHash( ceQueueDict )
          if queueName in self.queueCECache and self.queueCECache[queueName]['Hash'] == queueHash:
            queueCE = self.queueCECache[queueName]['CE']
          else:
            result = ceFactory.getCE( ceName = ce,
                                      ceType = ceDict['CEType'],
                                      ceParametersDict = ceQueueDict )
            if not result['OK']:
              return result
            self.queueCECache.setdefault( queueName, {} )
            self.queueCECache[queueName]['Hash'] = queueHash
            self.queueCECache[queueName]['CE'] = result['Value']
            queueCE = self.queueCECache[queueName]['CE']

          self.queueDict[queueName]['CE'] = queueCE
          self.queueDict[queueName]['CEName'] = ce
          self.queueDict[queueName]['CEType'] = ceDict['CEType']
          self.queueDict[queueName]['Site'] = site
          self.queueDict[queueName]['QueueName'] = queue
          self.queueDict[queueName]['Platform'] = platform
          result = self.queueDict[queueName]['CE'].isValid()
          if not result['OK']:
            self.log.fatal( result['Message'] )
            return result
          if 'BundleProxy' in self.queueDict[queueName]['ParametersDict']:
            if self.queueDict[queueName]['ParametersDict']['BundleProxy'].lower() in ['true','yes','1']:
              self.queueDict[queueName]['BundleProxy'] = True
          elif 'BundleProxy' in ceDict:
            if ceDict['BundleProxy'].lower() in ['true','yes','1']:
              self.queueDict[queueName]['BundleProxy'] = True

          if site not in self.sites:
            self.sites.append( site )

    return S_OK()
Ejemplo n.º 17
0
  def beginExecution( self ):

    self.gridEnv = self.am_getOption( "GridEnv", getGridEnv() )
    # The SiteDirector is for a particular user community
    self.vo = self.am_getOption( "VO", '' )
    if not self.vo:
      self.vo = self.am_getOption( "Community", '' )
    if not self.vo:
      self.vo = CSGlobals.getVO()
    # The SiteDirector is for a particular user group
    self.group = self.am_getOption( "Group", '' )

    # Choose the group for which pilots will be submitted. This is a hack until
    # we will be able to match pilots to VOs.
    if not self.group:
      if self.vo:
        result = Registry.getGroupsForVO( self.vo )
        if not result['OK']:
          return result
        self.voGroups = []
        for group in result['Value']:
          if 'NormalUser' in Registry.getPropertiesForGroup( group ):
            self.voGroups.append( group )
    else:
      self.voGroups = [ self.group ]

    result = findGenericPilotCredentials( vo = self.vo )
    if not result[ 'OK' ]:
      return result
    self.pilotDN, self.pilotGroup = result[ 'Value' ]
    self.pilotDN = self.am_getOption( "PilotDN", self.pilotDN )
    self.pilotGroup = self.am_getOption( "PilotGroup", self.pilotGroup )

    self.defaultSubmitPools = getSubmitPools( self.group, self.vo )

    self.pilot = self.am_getOption( 'PilotScript', DIRAC_PILOT )
    self.install = DIRAC_INSTALL
    self.extraModules = self.am_getOption( 'ExtraPilotModules', [] ) + DIRAC_MODULES
    self.workingDirectory = self.am_getOption( 'WorkDirectory' )
    self.maxQueueLength = self.am_getOption( 'MaxQueueLength', 86400 * 3 )
    self.pilotLogLevel = self.am_getOption( 'PilotLogLevel', 'INFO' )
    self.maxJobsInFillMode = self.am_getOption( 'MaxJobsInFillMode', self.maxJobsInFillMode )
    self.maxPilotsToSubmit = self.am_getOption( 'MaxPilotsToSubmit', self.maxPilotsToSubmit )
    self.pilotWaitingFlag = self.am_getOption( 'PilotWaitingFlag', True )
    self.pilotWaitingTime = self.am_getOption( 'MaxPilotWaitingTime', 3600 )
    self.failedQueueCycleFactor = self.am_getOption( 'FailedQueueCycleFactor', 10 )
    self.pilotStatusUpdateCycleFactor = self.am_getOption( 'PilotStatusUpdateCycleFactor', 10 )
    self.addPilotsToEmptySites = self.am_getOption( 'AddPilotsToEmptySites', False )

    # Flags
    self.updateStatus = self.am_getOption( 'UpdatePilotStatus', True )
    self.getOutput = self.am_getOption( 'GetPilotOutput', False )
    self.sendAccounting = self.am_getOption( 'SendPilotAccounting', True )

    # Get the site description dictionary
    siteNames = None
    if not self.am_getOption( 'Site', 'Any' ).lower() == "any":
      siteNames = self.am_getOption( 'Site', [] )
      if not siteNames:
        siteNames = None
    ceTypes = None
    if not self.am_getOption( 'CETypes', 'Any' ).lower() == "any":
      ceTypes = self.am_getOption( 'CETypes', [] )
    ces = None
    if not self.am_getOption( 'CEs', 'Any' ).lower() == "any":
      ces = self.am_getOption( 'CEs', [] )
      if not ces:
        ces = None
    result = Resources.getQueues( community = self.vo,
                                  siteList = siteNames,
                                  ceList = ces,
                                  ceTypeList = ceTypes,
                                  mode = 'Direct' )
    if not result['OK']:
      return result
    resourceDict = result['Value']
    result = self.getQueues( resourceDict )
    if not result['OK']:
      return result

    #if not siteNames:
    #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
    #  if siteName == 'Unknown':
    #    return S_OK( 'No site specified for the SiteDirector' )
    #  else:
    #    siteNames = [siteName]
    #self.siteNames = siteNames

    if self.updateStatus:
      self.log.always( 'Pilot status update requested' )
    if self.getOutput:
      self.log.always( 'Pilot output retrieval requested' )
    if self.sendAccounting:
      self.log.always( 'Pilot accounting sending requested' )

    self.log.always( 'Sites:', siteNames )
    self.log.always( 'CETypes:', ceTypes )
    self.log.always( 'CEs:', ces )
    self.log.always( 'PilotDN:', self.pilotDN )
    self.log.always( 'PilotGroup:', self.pilotGroup )
    self.log.always( 'MaxPilotsToSubmit:', self.maxPilotsToSubmit )
    self.log.always( 'MaxJobsInFillMode:', self.maxJobsInFillMode )

    self.localhost = socket.getfqdn()
    self.proxy = ''

    if self.firstPass:
      if self.queueDict:
        self.log.always( "Agent will serve queues:" )
        for queue in self.queueDict:
          self.log.always( "Site: %s, CE: %s, Queue: %s" % ( self.queueDict[queue]['Site'],
                                                             self.queueDict[queue]['CEName'],
                                                             queue ) )
    self.firstPass = False
    return S_OK()
Ejemplo n.º 18
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()
        result = pilotAgentsDB.countPilots(
            {
                'TaskQueueID': tqIDList,
                'Status': WAITING_PILOT_STATUS
            }, None)
        totalWaitingPilots = 0
        if result['OK']:
            totalWaitingPilots = result['Value']
        self.log.info(
            'Total %d jobs in %d task queues with %d waiting pilots' %
            (totalWaitingJobs, len(tqIDList), totalWaitingPilots))
        #if totalWaitingPilots >= totalWaitingJobs:
        #  self.log.info( 'No more pilots to be submitted in this cycle' )
        #  return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        queues = self.queueDict.keys()
        random.shuffle(queues)
        totalSubmittedPilots = 0
        for queue in queues:

            # Check if the queue failed previously
            failedCount = self.failedQueues.setdefault(
                queue, 0) % self.failedQueueCycleFactor
            if failedCount != 0:
                self.log.warn("%s queue failed recently, skipping %d cycles" %
                              (queue, 10 - failedCount))
                self.failedQueues[queue] += 1
                continue

            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            platform = self.queueDict[queue]['Platform']
            siteMask = siteName in siteMaskList

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at site %s since no workload expected" %
                    (queueName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose(
                    "Skipping queue %s at site %s not in the mask" %
                    (queueName, siteName))
                continue

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            #if not siteMask and 'Site' in ceDict:
            #  self.log.info( 'Site not in the mask %s' % siteName )
            #  self.log.info( 'Removing "Site" from matching Dict' )
            #  del ceDict[ 'Site' ]
            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % queue)
                continue

            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), queue))

            # Get the number of already waiting pilots for these task queues
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {
                        'TaskQueueID': tqIDList,
                        'Status': WAITING_PILOT_STATUS
                    }, None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots',
                                   result['Message'])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result['Value']
                    self.log.verbose(
                        'Waiting Pilots for TaskQueue %s:' % tqIDList,
                        totalWaitingPilots)
            if totalWaitingPilots >= totalTQJobs:
                self.log.verbose(
                    "%d waiting pilots already for all the available jobs" %
                    totalWaitingPilots)
                continue

            self.log.verbose(
                "%d waiting pilots for the total of %d eligible jobs for %s" %
                (totalWaitingPilots, totalTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.__getQueueSlots(queue)
            if totalSlots == 0:
                continue

            pilotsToSubmit = max(
                0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                                    ( queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                if ceType == 'CREAM':
                    jobExecDir = '.'
                jobExecDir = self.queueDict[queue].get('JobExecDir',
                                                       jobExecDir)
                httpProxy = self.queueDict[queue].get('HttpProxy', '')

                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy, httpProxy,
                                              jobExecDir)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable, '', pilotSubmissionChunk)
                os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue,
                                   result['Message'])
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' %
                              (len(pilotList), queueName, ceName))
                stampDict = {}
                if result.has_key('PilotStampDict'):
                    stampDict = result['PilotStampDict']
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if not tqDict.has_key(tqID):
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.pilotDN, self.pilotGroup,
                        self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: ',
                            result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ',
                                           result['Message'])
                            continue

        self.log.info("%d pilots submitted in total in this cycle" %
                      totalSubmittedPilots)
        return S_OK()
Ejemplo n.º 19
0
    def getQueues(self, resourceDict):
        """ Get the list of relevant CEs and their descriptions
    """

        self.queueDict = {}
        ceFactory = ComputingElementFactory()

        for site in resourceDict:
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                qDict = ceDict.pop("Queues")
                for queue in qDict:
                    queueName = "%s_%s" % (ce, queue)
                    self.queueDict[queueName] = {}
                    self.queueDict[queueName]["ParametersDict"] = qDict[queue]
                    self.queueDict[queueName]["ParametersDict"]["Queue"] = queue
                    self.queueDict[queueName]["ParametersDict"]["Site"] = site
                    self.queueDict[queueName]["ParametersDict"]["GridEnv"] = self.gridEnv
                    self.queueDict[queueName]["ParametersDict"]["Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown")
                    # Evaluate the CPU limit of the queue according to the Glue convention
                    # To Do: should be a utility
                    if (
                        "maxCPUTime" in self.queueDict[queueName]["ParametersDict"]
                        and "SI00" in self.queueDict[queueName]["ParametersDict"]
                    ):
                        maxCPUTime = float(self.queueDict[queueName]["ParametersDict"]["maxCPUTime"])
                        # For some sites there are crazy values in the CS
                        maxCPUTime = max(maxCPUTime, 0)
                        maxCPUTime = min(maxCPUTime, 86400 * 12.5)
                        si00 = float(self.queueDict[queueName]["ParametersDict"]["SI00"])
                        queueCPUTime = 60.0 / 250.0 * maxCPUTime * si00
                        self.queueDict[queueName]["ParametersDict"]["CPUTime"] = int(queueCPUTime)
                    qwDir = os.path.join(self.workingDirectory, queue)
                    if not os.path.exists(qwDir):
                        os.makedirs(qwDir)
                    self.queueDict[queueName]["ParametersDict"]["WorkingDirectory"] = qwDir

                    platform = ""
                    if "Platform" in self.queueDict[queueName]["ParametersDict"]:
                        platform = self.queueDict[queueName]["ParametersDict"]["Platform"]
                    elif "Platform" in ceDict:
                        platform = ceDict["Platform"]
                    elif "OS" in ceDict:
                        architecture = ceDict.get("architecture", "x86_64")
                        OS = ceDict["OS"]
                        platform = "_".join([architecture, OS])
                    if platform and not platform in self.platforms:
                        self.platforms.append(platform)

                    if not "Platform" in self.queueDict[queueName]["ParametersDict"] and platform:
                        result = Resources.getDIRACPlatform(platform)
                        if result["OK"]:
                            self.queueDict[queueName]["ParametersDict"]["Platform"] = result["Value"]

                    ceQueueDict = dict(ceDict)
                    ceQueueDict.update(self.queueDict[queueName]["ParametersDict"])
                    result = ceFactory.getCE(ceName=ce, ceType=ceDict["CEType"], ceParametersDict=ceQueueDict)
                    if not result["OK"]:
                        return result
                    self.queueDict[queueName]["CE"] = result["Value"]
                    self.queueDict[queueName]["CEName"] = ce
                    self.queueDict[queueName]["CEType"] = ceDict["CEType"]
                    self.queueDict[queueName]["Site"] = site
                    self.queueDict[queueName]["QueueName"] = queue
                    result = self.queueDict[queueName]["CE"].isValid()
                    if not result["OK"]:
                        self.log.fatal(result["Message"])
                        return result
                    if "BundleProxy" in self.queueDict[queueName]["ParametersDict"]:
                        self.queueDict[queueName]["BundleProxy"] = True
                    elif "BundleProxy" in ceDict:
                        self.queueDict[queueName]["BundleProxy"] = True

                    if site not in self.sites:
                        self.sites.append(site)

        return S_OK()
Ejemplo n.º 20
0
 def export_getSites( self ):  
   '''
     Returns list of all sites considered by RSS
   '''
   gLogger.info( 'getSites' )
   return Resources.getSites()
  global ceName
  ceName = args

def setSite( args ):
  global Site
  Site = args

def setQueue( args ):
  global Queue
  Queue = args

Script.registerSwitch( "N:", "Name=", "Computing Element Name (Mandatory)", setCEName )
Script.registerSwitch( "S:", "Site=", "Site Name (Mandatory)", setSite )
Script.registerSwitch( "Q:", "Queue=", "Queue Name (Mandatory)", setQueue )


Script.parseCommandLine( ignoreErrors = True )
args = Script.getExtraCLICFGFiles()

if len( args ) > 1:
  Script.showHelp()
  exit( -1 )


result = Resources.getQueue( Site, ceName, Queue )

if not result['OK']:
  gLogger.error( "Could not retrieve resource parameters", ": " + result['Message'] )
  DIRACExit( 1 )
gLogger.notice( json.dumps( result['Value'] ) )
Ejemplo n.º 22
0
    def getImages(self, resourceDict):
        """ Get the list of relevant CEs and their descriptions
    """

        self.imageDict = {}
        ceFactory = EndpointFactory()

        result = getPilotBootstrapParameters(vo=self.vo,
                                             runningPod=self.runningPod)
        if not result['OK']:
            return result
        opParameters = result['Value']

        for site in resourceDict:
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                ceTags = ceDict.get('Tag', [])
                if isinstance(ceTags, basestring):
                    ceTags = fromChar(ceTags)
                ceMaxRAM = ceDict.get('MaxRAM', None)
                qDict = ceDict.pop('Images')
                for image in qDict:
                    imageName = '%s_%s' % (ce, image)
                    self.imageDict[imageName] = {}
                    self.imageDict[imageName]['ParametersDict'] = qDict[image]
                    self.imageDict[imageName]['ParametersDict'][
                        'Image'] = image
                    self.imageDict[imageName]['ParametersDict']['Site'] = site
                    self.imageDict[imageName]['ParametersDict'][
                        'Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')
                    self.imageDict[imageName]['ParametersDict'][
                        'CPUTime'] = 99999999
                    imageTags = self.imageDict[imageName][
                        'ParametersDict'].get('Tag')
                    if imageTags and isinstance(imageTags, basestring):
                        imageTags = fromChar(imageTags)
                        self.imageDict[imageName]['ParametersDict'][
                            'Tag'] = imageTags
                    if ceTags:
                        if imageTags:
                            allTags = list(set(ceTags + imageTags))
                            self.imageDict[imageName]['ParametersDict'][
                                'Tag'] = allTags
                        else:
                            self.imageDict[imageName]['ParametersDict'][
                                'Tag'] = ceTags

                    maxRAM = self.imageDict[imageName]['ParametersDict'].get(
                        'MaxRAM')
                    maxRAM = ceMaxRAM if not maxRAM else maxRAM
                    if maxRAM:
                        self.imageDict[imageName]['ParametersDict'][
                            'MaxRAM'] = maxRAM

                    platform = ''
                    if "Platform" in self.imageDict[imageName][
                            'ParametersDict']:
                        platform = self.imageDict[imageName]['ParametersDict'][
                            'Platform']
                    elif "Platform" in ceDict:
                        platform = ceDict['Platform']
                    if platform and not platform in self.platforms:
                        self.platforms.append(platform)

                    if not "Platform" in self.imageDict[imageName][
                            'ParametersDict'] and platform:
                        result = Resources.getDIRACPlatform(platform)
                        if result['OK']:
                            self.imageDict[imageName]['ParametersDict'][
                                'Platform'] = result['Value'][0]

                    ceImageDict = dict(ceDict)
                    ceImageDict['CEName'] = ce
                    ceImageDict['VO'] = self.vo
                    ceImageDict['Image'] = image
                    ceImageDict['RunningPod'] = self.runningPod
                    ceImageDict['CSServers'] = gConfig.getValue(
                        "/DIRAC/Configuration/Servers", [])
                    ceImageDict.update(
                        self.imageDict[imageName]['ParametersDict'])
                    ceImageDict.update(opParameters)

                    # Generate the CE object for the image or pick the already existing one
                    # if the image definition did not change
                    imageHash = self.__generateImageHash(ceImageDict)
                    if imageName in self.imageCECache and self.imageCECache[
                            imageName]['Hash'] == imageHash:
                        imageCE = self.imageCECache[imageName]['CE']
                    else:
                        result = ceFactory.getCEObject(parameters=ceImageDict)
                        if not result['OK']:
                            return result
                        self.imageCECache.setdefault(imageName, {})
                        self.imageCECache[imageName]['Hash'] = imageHash
                        self.imageCECache[imageName]['CE'] = result['Value']
                        imageCE = self.imageCECache[imageName]['CE']

                    self.imageDict[imageName]['CE'] = imageCE
                    self.imageDict[imageName]['CEName'] = ce
                    self.imageDict[imageName]['CEType'] = ceDict['CEType']
                    self.imageDict[imageName]['Site'] = site
                    self.imageDict[imageName]['ImageName'] = image
                    self.imageDict[imageName]['Platform'] = platform
                    self.imageDict[imageName]['MaxInstances'] = ceDict[
                        'MaxInstances']
                    if not self.imageDict[imageName]['CE'].isValid():
                        self.log.error(
                            'Failed to instantiate CloudEndpoint for %s' %
                            imageName)
                        continue

                    if site not in self.sites:
                        self.sites.append(site)

        return S_OK()
Ejemplo n.º 23
0
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            jobSites.add( site )
          else:
            anySite = True
      else:
        anySite = True
      if "JobTypes" in result['Value'][tqID]:
        if "Sites" in result['Value'][tqID]:
          for site in result['Value'][tqID]['Sites']:
            if site.lower() != 'any':
              testSites.add( site )
      totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                          'Status': WAITING_PILOT_STATUS },
                                           None )
    totalWaitingPilots = 0
    if result['OK']:
      totalWaitingPilots = result['Value']
    self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len( tqIDList ), totalWaitingPilots ) )
    #if totalWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
      return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    queues = self.queueDict.keys()
    random.shuffle( queues )
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues.setdefault( queue, 0 ) % self.failedQueueCycleFactor
      if failedCount != 0:
        self.log.warn( "%s queue failed recently, skipping %d cycles" % ( queue, 10-failedCount ) )
        self.failedQueues[queue] += 1
        continue

      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      platform = self.queueDict[queue]['Platform']
      siteMask = siteName in siteMaskList

      if not anySite and siteName not in jobSites:
        self.log.verbose( "Skipping queue %s at %s: no workload expected" % (queueName, siteName) )
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose( "Skipping queue %s at site %s not in the mask" % (queueName, siteName) )
        continue

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      #if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools
      
      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose( 'No matching TQs found for %s' % queue )
        continue

      matchedQueues += 1
      totalTQJobs = 0
      tqIDList = taskQueueDict.keys()
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len( tqIDList ), queue) )

      # Get the number of already waiting pilots for these task queues
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                              None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )
      if totalWaitingPilots >= totalTQJobs:
        self.log.verbose( "%d waiting pilots already for all the available jobs" % totalWaitingPilots )
        continue

      self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % (totalWaitingPilots, totalTQJobs, queue) )

      # Get the working proxy
      cpuTime = queueCPUTime + 86400
      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      totalSlots = self.__getQueueSlots( queue )
      if totalSlots == 0:
        self.log.debug( '%s: No slots available' % queue )
        continue

      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                              ( queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          self.failedQueues[queue] += 1
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.queueSlots[queue]['AvailableSlots'] -= len( pilotList )
        totalSubmittedPilots += len( pilotList )
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        if result.has_key( 'PilotStampDict' ):
          stampDict = result['PilotStampDict']
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        rndm = random.random()*sumPriority
        tqDict = {}
        for pilotID in pilotList:
          rndm = random.random() * sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if not tqDict.has_key( tqID ):
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                      tqID,
                                                      self.pilotDN,
                                                      self.pilotGroup,
                                                      self.localhost,
                                                      ceType,
                                                      '',
                                                      stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % ( totalSubmittedPilots, matchedQueues ) )
    return S_OK()
Ejemplo n.º 24
0
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    queues = self.queueDict.keys()
    random.shuffle( queues )
    for queue in queues:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' )
      platform = self.queueDict[queue]['Platform']

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Get the working proxy
      cpuTime = queueCPUTime + 86400

      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        continue
      ceInfoDict = result['CEInfoDict']
      self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                     ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )

      totalSlots = result['Value']

      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site not in the mask %s' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.info( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      tqIDList = taskQueueDict.keys()
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      pilotsToSubmit = min( totalSlots, totalTQJobs )

      # Get the number of already waiting pilots for this queue
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                            None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )

      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \
                              ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        if result.has_key( 'PilotStampDict' ):
          stampDict = result['PilotStampDict']
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        rndm = random.random()*sumPriority
        tqDict = {}
        for pilotID in pilotList:
          rndm = random.random()*sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if not tqDict.has_key( tqID ):
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, pilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                     tqID,
                                                     self.pilotDN,
                                                     self.pilotGroup,
                                                     self.localhost,
                                                     ceType,
                                                     '',
                                                     stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in pilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    return S_OK()
Ejemplo n.º 25
0
  def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary
    """

    queues = self.queueDict.keys()

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
      tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
      return result
    if not result['Value']:
      self.log.verbose('No Waiting jobs suitable for the director')
      return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            jobSites.add(site)
          else:
            anySite = True
      else:
        anySite = True
      if "JobTypes" in result['Value'][tqID]:
        if "Sites" in result['Value'][tqID]:
          for site in result['Value'][tqID]['Sites']:
            if site.lower() != 'any':
              testSites.add(site)
      totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
      tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())
    # if tagWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    result = self.siteClient.getUsableSites()
    if not result['OK']:
      return result
    siteMaskList = result['Value']

    queues = self.queueDict.keys()
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
      if failedCount != 0:
        self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount))
        self.failedQueues[queue] += 1
        continue

      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      platform = self.queueDict[queue]['Platform']
      queueTags = self.queueDict[queue]['ParametersDict']['Tag']
      siteMask = siteName in siteMaskList
      processorTags = []

      # Check the status of the Site
      result = self.siteClient.getUsableSites(siteName)
      if not result['OK']:
        self.log.error("Can not get the status of site %s: %s" %
                       (siteName, result['Message']))
        continue
      if siteName not in result.get('Value', []):
        self.log.info("site %s is not active" % siteName)
        continue

      if self.rssFlag:
        # Check the status of the ComputingElement
        result = self.rssClient.getElementStatus(ceName, "ComputingElement")
        if not result['OK']:
          self.log.error("Can not get the status of computing element",
                         " %s: %s" % (siteName, result['Message']))
          continue
        if result['Value']:
          # get the value of the status
          result = result['Value'][ceName]['all']

        if result not in ('Active', 'Degraded'):
          self.log.verbose(
              "Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
          continue

      for tag in queueTags:
        if re.match(r'^[0-9]+Processors$', tag):
          processorTags.append(tag)
      if 'WholeNode' in queueTags:
        processorTags.append('WholeNode')

      if not anySite and siteName not in jobSites:
        self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
        continue

      if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
        queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
      else:
        self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict['GridCE'] = ceName
      # if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms(platform)
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      ceDict['Tag'] = queueTags
      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues(ceDict)
      if not result['OK']:
        self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose('No matching TQs found for %s' % queue)
        continue

      matchedQueues += 1
      totalTQJobs = 0
      totalTQJobsByProcessors = {}
      tqIDList = taskQueueDict.keys()
      tqIDListByProcessors = {}
      for tq in taskQueueDict:
        if 'Tags' not in taskQueueDict[tq]:
          # skip non multiprocessor tqs
          continue
        for tag in taskQueueDict[tq]['Tags']:
          if tag in processorTags:
            tqIDListByProcessors.setdefault(tag, [])
            tqIDListByProcessors[tag].append(tq)

            totalTQJobsByProcessors.setdefault(tag, 0)
            totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs,
                                                                                      len(tqIDList), queue))

      queueSubmittedPilots = 0
      for tag in tqIDListByProcessors:

        self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

        processors = 1

        m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
        if m:
          processors = int(m.group('processors'))
        if tag == 'WholeNode':
          processors = -1

        tagTQJobs = totalTQJobsByProcessors[tag]
        tagTqIDList = tqIDListByProcessors[tag]

        # Get the number of already waiting pilots for these task queues
        tagWaitingPilots = 0
        if self.pilotWaitingFlag:
          lastUpdateTime = dateTime() - self.pilotWaitingTime * second
          result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                              'Status': WAITING_PILOT_STATUS},
                                             None, lastUpdateTime)
          if not result['OK']:
            self.log.error('Failed to get Number of Waiting pilots', result['Message'])
            tagWaitingPilots = 0
          else:
            tagWaitingPilots = result['Value']
            self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
        if tagWaitingPilots >= tagTQJobs:
          self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
          continue

        self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" % (tagWaitingPilots,
                                                                                         tagTQJobs, queue))

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        totalSlots = self.getQueueSlots(queue, False)
        if totalSlots == 0:
          self.log.debug('%s: No slots available' % queue)
          continue

        # Note: comparing slots to job numbers is not accurate in multiprocessor case.
        #       This could lead to over submission.
        pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
        self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                      (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

        while pilotsToSubmit > 0:
          self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

          bundleProxy = self.queueDict[queue].get('BundleProxy', False)
          jobExecDir = ''
          jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)

          executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                bundleProxy=bundleProxy,
                                                                jobExecDir=jobExecDir,
                                                                processors=processors)
          result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
          # ## FIXME: The condor thing only transfers the file with some
          # ## delay, so when we unlink here the script is gone
          # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
          if ceType != 'HTCondorCE':
            os.unlink(executable)
          if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
            pilotsToSubmit = 0
            self.failedQueues[queue] += 1
            continue

          pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
          queueSubmittedPilots += pilotSubmissionChunk
          # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
          # task queue priorities
          pilotList = result['Value']
          self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
          totalSubmittedPilots += len(pilotList)
          self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
          stampDict = {}
          if 'PilotStampDict' in result:
            stampDict = result['PilotStampDict']
          tqPriorityList = []
          sumPriority = 0.
          for tq in tagTqIDList:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))
          rndm = random.random() * sumPriority
          tqDict = {}
          for pilotID in pilotList:
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
              if rndm < prio:
                tqID = tq
                break
            if tqID not in tqDict:
              tqDict[tqID] = []
            tqDict[tqID].append(pilotID)

          for tqID, pilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference(pilotList,
                                                       tqID,
                                                       self.pilotDN,
                                                       self.pilotGroup,
                                                       self.localhost,
                                                       ceType,
                                                       stampDict)
            if not result['OK']:
              self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
              continue
            for pilot in pilotList:
              result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                    'Successfully submitted by the SiteDirector',
                                                    siteName, queueName)
              if not result['OK']:
                self.log.error('Failed to set pilot status: ', result['Message'])
                continue

    self.log.info(
        "%d pilots submitted in total in this cycle, %d matched queues" %
        (totalSubmittedPilots, matchedQueues))
    return S_OK()
Ejemplo n.º 26
0
def main():
    global fullMatch
    global sites
    Script.registerSwitch("F", "full-match", "Check all the matching criteria",
                          setFullMatch)
    Script.registerSwitch(
        "S:", "site=", "Check matching for these sites (comma separated list)",
        setSites)

    Script.parseCommandLine(ignoreErrors=True)
    args = Script.getPositionalArgs()

    if len(args) == 0:
        gLogger.error("Error: No job description provided")
        Script.showHelp(exitCode=1)

    from DIRAC.Core.Security.ProxyInfo import getVOfromProxyGroup
    from DIRAC.ConfigurationSystem.Client.Helpers import Resources
    from DIRAC.Core.Utilities.PrettyPrint import printTable
    from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
    from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result['OK']:
        gLogger.error('No proxy found, please login')
        DIRACExit(-1)
    voName = result['Value']

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    siteDict = resultQueues['Value']
    result = getQueuesResolved(siteDict)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    queueDict = result['Value']

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask['OK']:
        gLogger.error('Failed to get Site mask information')
        DIRACExit(-1)
    siteMaskList = resultMask.get('Value', [])

    rssClient = ResourceStatus()

    fields = ('Site', 'CE', 'Queue', 'Status', 'Match', 'Reason')
    records = []

    for queue, queueInfo in queueDict.items():
        site = queueInfo['Site']
        ce = queueInfo['CEName']
        siteStatus = "Active" if site in siteMaskList else "InActive"
        ceStatus = siteStatus
        if rssClient.rssFlag:
            result = rssClient.getElementStatus(ce, "ComputingElement")
            if result['OK']:
                ceStatus = result['Value'][ce]['all']

        result = matchQueue(jdl, queueInfo, fullMatch=fullMatch)
        if not result['OK']:
            gLogger.error('Failed in getting match data', result['Message'])
            DIRACExit(-1)
        status = "Active" if siteStatus == "Active" and ceStatus == "Active" else "Inactive"
        if result['Value']['Match']:
            records.append((site, ce, queueInfo['Queue'], status, 'Yes', ''))
        else:
            records.append((site, ce, queueInfo['Queue'], status, 'No',
                            result['Value']['Reason']))

    gLogger.notice(
        printTable(fields,
                   records,
                   sortField='Site',
                   columnSeparator='  ',
                   printOut=False))
Ejemplo n.º 27
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools}
        if self.vo:
            tqDict["Community"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        if not result["Value"]:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result["Value"]

        queues = self.queueDict.keys()
        random.shuffle(queues)
        for queue in queues:
            ce = self.queueDict[queue]["CE"]
            ceName = self.queueDict[queue]["CEName"]
            ceType = self.queueDict[queue]["CEType"]
            queueName = self.queueDict[queue]["QueueName"]
            siteName = self.queueDict[queue]["Site"]
            siteMask = siteName in siteMaskList

            if "CPUTime" in self.queueDict[queue]["ParametersDict"]:
                queueCPUTime = int(self.queueDict[queue]["ParametersDict"]["CPUTime"])
            else:
                self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result["OK"]:
                return result
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result["OK"]:
                self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"]))
                continue
            ceInfoDict = result["CEInfoDict"]
            self.log.info(
                "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
                % (
                    ceName,
                    queueName,
                    ceInfoDict["WaitingJobs"],
                    ceInfoDict["RunningJobs"],
                    ceInfoDict["SubmittedJobs"],
                    ceInfoDict["MaxTotalJobs"],
                )
            )

            totalSlots = result["Value"]

            ceDict = ce.getParameterDict()
            ceDict["GridCE"] = ceName
            if not siteMask and "Site" in ceDict:
                self.log.info("Site not in the mask %s" % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict["Site"]
            if self.vo:
                ceDict["Community"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            # This is a hack to get rid of !
            ceDict["SubmitPool"] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(self.platforms)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.info("No matching TQs found")
                continue

            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            pilotsToSubmit = min(totalSlots, totalTQJobs)

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime
                )
                if not result["OK"]:
                    self.log.error("Failed to get Number of Waiting pilots", result["Message"])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result["Value"]
                    self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots)

            pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info(
                "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d"
                % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit)
            )

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get("BundleProxy", False)
                jobExecDir = ""
                if ceType == "CREAM":
                    jobExecDir = "."
                jobExecDir = self.queueDict[queue].get("JobExecDir", jobExecDir)
                httpProxy = self.queueDict[queue].get("HttpProxy", "")

                result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
                if not result["OK"]:
                    return result

                executable, pilotSubmissionChunk = result["Value"]
                result = ce.submitJob(executable, "", pilotSubmissionChunk)
                os.unlink(executable)
                if not result["OK"]:
                    self.log.error("Failed submission to queue %s:\n" % queue, result["Message"])
                    pilotsToSubmit = 0
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result["Value"]
                self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName))
                stampDict = {}
                if result.has_key("PilotStampDict"):
                    stampDict = result["PilotStampDict"]
                tqPriorityList = []
                sumPriority = 0.0
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]["Priority"]
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if not tqDict.has_key(tqID):
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict
                    )
                    if not result["OK"]:
                        self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot,
                            "Submitted",
                            ceName,
                            "Successfully submitted by the SiteDirector",
                            siteName,
                            queueName,
                        )
                        if not result["OK"]:
                            self.log.error("Failed to set pilot status: ", result["Message"])
                            continue

        return S_OK()
def setSite( args ):
  global Site
  Site = args

def setQueue( args ):
  global Queue
  Queue = args

Script.registerSwitch( "N:", "Name=", "Computing Element Name (Mandatory)", setCEName )
Script.registerSwitch( "S:", "Site=", "Site Name (Mandatory)", setSite )
Script.registerSwitch( "Q:", "Queue=", "Queue Name (Mandatory)", setQueue )


Script.parseCommandLine( ignoreErrors = True )
args = Script.getExtraCLICFGFiles()

if len( args ) > 1:
  Script.showHelp()
  exit( -1 )


result = Resources.getQueue( Site, ceName, Queue )
if not result['OK']:
  gLogger.error( "Could not retrieve resource parameters", ": " + result['Message'] )
  DIRACExit( 1 )
gLogger.notice( json.dumps( result['Value'] ) )




Ejemplo n.º 29
0
    def createVMs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        images = self.imageDict.keys()

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {'Setup': setup, 'CPUTime': 9999999}
        if self.vo:
            tqDict['VO'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for image in images:
            if 'Tags' in self.imageDict[image]['ParametersDict']:
                tags += self.imageDict[image]['ParametersDict']['Tags']
        tqDict['Tag'] = list(set(tags))
        tqDict['SubmitPool'] = "mpdPool"

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()

        result = virtualMachineDB.getInstanceCounters('Status', {})
        totalVMs = 0
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted', 'Running']:
                    totalVMs += result['Value'][status]
        self.log.info('Total %d jobs in %d task queues with %d VMs' %
                      (totalWaitingJobs, len(tqIDList), totalVMs))

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result['OK']:
            return S_ERROR('Can not get the site mask')
        siteMaskList = result['Value']

        images = self.imageDict.keys()
        random.shuffle(images)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for image in images:

            # Check if the image failed previously
            #failedCount = self.failedImages[ image ] % self.failedImageCycleFactor
            #if failedCount != 0:
            #  self.log.warn( "%s queue failed recently, skipping %d cycles" % ( image, 10-failedCount ) )
            #  self.failedImages[image] += 1
            #  continue

            #print "AT >>> image parameters:", image
            #for key,value in self.imageDict[image].items():
            #  print key,value

            ce = self.imageDict[image]['CE']
            ceName = self.imageDict[image]['CEName']
            imageName = self.imageDict[image]['ImageName']
            siteName = self.imageDict[image]['Site']
            platform = self.imageDict[image]['Platform']
            imageTags = self.imageDict[image]['ParametersDict'].get('Tags', [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.imageDict[image]['MaxInstances'])
            processorTags = []

            for tag in imageTags:
                if re.match(r'^[0-9]+Processors$', tag):
                    processorTags.append(tag)
            # vms support WholeNode naturally
            processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (imageName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" %
                                 (imageName, siteName))
                continue

            if 'CPUTime' in self.imageDict[image]['ParametersDict']:
                imageCPUTime = int(
                    self.imageDict[image]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % image)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['VO'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = processorTags

            # Get the number of eligible jobs for the target site/queue

            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % image)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            tqIDList = taskQueueDict.keys()
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), image))

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters(
                'Status', {'Endpoint': endpoint})
            if result['OK']:
                for status in result['Value']:
                    if status in ['New', 'Submitted']:
                        totalWaitingVMs += result['Value'][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" %
                                 totalWaitingVMs)

            self.log.verbose(
                "%d VMs for the total of %d eligible jobs for %s" %
                (totalWaitingVMs, totalTQJobs, image))

            # Get the working proxy
            self.log.verbose("Getting cloud proxy for %s/%s" %
                             (self.cloudDN, self.cloudGroup))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.cloudDN, self.cloudGroup, 3600)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            #ce.setProxy( self.proxy, cpuTime - 60 )

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % image)
                continue

            vmsToSubmit = max(0, min(totalSlots,
                                     totalTQJobs - totalWaitingVMs))
            self.log.info( '%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' % \
                                    ( image, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit ) )

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)

            self.log.info('Going to submit %d VMs to %s queue' %
                          (vmsToSubmit, image))
            result = ce.createInstances(vmsToSubmit)
            #result = S_OK()
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % image,
                               result['Message'])
                self.failedImages.setdefault(image, 0)
                self.failedImages[image] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result['Value']
            totalSubmittedPilots += len(vmDict)
            self.log.info('Submitted %d VMs to %s@%s' %
                          (len(vmDict), imageName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]['InstanceID']
                endpoint = '%s::%s' % (self.imageDict[image]['Site'], ceName)
                result = virtualMachineDB.insertInstance(
                    uuID, imageName, diracUUID, endpoint, self.vo)
                if not result['OK']:
                    continue
                for ncpu in range(vmDict[uuID]['NumberOfCPUs']):
                    pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(
                        ncpu).zfill(2)
                    pilotList.append(pRef)

            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if not tqDict.has_key(tqID):
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(
                    pilotList, tqID, '', '', self.localhost, 'Cloud',
                    stampDict)
                if not result['OK']:
                    self.log.error(
                        'Failed to insert pilots into the PilotAgentsDB')

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Ejemplo n.º 30
0
  def getQueues( self, resourceDict ):
    """ Get the list of relevant CEs and their descriptions
    """

    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
      for ce in resourceDict[site]:
        ceDict = resourceDict[site][ce]
        ceTags = ceDict.get( 'Tag' )
        if isinstance( ceTags, basestring ):
          ceTags = fromChar( ceTags )
        qDict = ceDict.pop( 'Queues' )
        for queue in qDict:
          queueName = '%s_%s' % ( ce, queue )
          self.queueDict[queueName] = {}
          self.queueDict[queueName]['ParametersDict'] = qDict[queue]
          self.queueDict[queueName]['ParametersDict']['Queue'] = queue
          self.queueDict[queueName]['ParametersDict']['Site'] = site
          self.queueDict[queueName]['ParametersDict']['GridEnv'] = self.gridEnv
          self.queueDict[queueName]['ParametersDict']['Setup'] = gConfig.getValue( '/DIRAC/Setup', 'unknown' )
          # Evaluate the CPU limit of the queue according to the Glue convention
          # To Do: should be a utility
          if "maxCPUTime" in self.queueDict[queueName]['ParametersDict'] and \
             "SI00" in self.queueDict[queueName]['ParametersDict']:
            maxCPUTime = float( self.queueDict[queueName]['ParametersDict']['maxCPUTime'] )
            # For some sites there are crazy values in the CS
            maxCPUTime = max( maxCPUTime, 0 )
            maxCPUTime = min( maxCPUTime, 86400 * 12.5 )
            si00 = float( self.queueDict[queueName]['ParametersDict']['SI00'] )
            queueCPUTime = 60. / 250. * maxCPUTime * si00
            self.queueDict[queueName]['ParametersDict']['CPUTime'] = int( queueCPUTime )
          queueTags = self.queueDict[queueName]['ParametersDict'].get( 'Tag' )
          if queueTags and isinstance( queueTags, basestring ):
            queueTags = fromChar( queueTags )
            self.queueDict[queueName]['ParametersDict']['Tag'] = queueTags
          if ceTags:
            if queueTags:
              allTags = list( set( ceTags + queueTags ) )
              self.queueDict[queueName]['ParametersDict']['Tag'] = allTags
            else:
              self.queueDict[queueName]['ParametersDict']['Tag'] = ceTags

          maxMemory = self.queueDict[queueName]['ParametersDict'].get( 'MaxRAM', None )
          if maxMemory:
            # MaxRAM value is supposed to be in MB
            maxMemoryList = range( 1, int( maxMemory )/1000 + 1 )
            memoryTags = [ '%dGB' % mem for mem in maxMemoryList ]
            if memoryTags:
              self.queueDict[queueName]['ParametersDict'].setdefault( 'Tag', [] )
              self.queueDict[queueName]['ParametersDict']['Tag'] += memoryTags
          qwDir = os.path.join( self.workingDirectory, queue )
          if not os.path.exists( qwDir ):
            os.makedirs( qwDir )
          self.queueDict[queueName]['ParametersDict']['WorkingDirectory'] = qwDir

          platform = ''
          if "Platform" in self.queueDict[queueName]['ParametersDict']:
            platform = self.queueDict[queueName]['ParametersDict']['Platform']
          elif "Platform" in ceDict:
            platform = ceDict['Platform']
          elif "OS" in ceDict:
            architecture = ceDict.get( 'architecture', 'x86_64' )
            OS = ceDict['OS']
            platform = '_'.join( [architecture, OS] )
          if platform and not platform in self.platforms:
            self.platforms.append( platform )

          if not "Platform" in self.queueDict[queueName]['ParametersDict'] and platform:
            result = Resources.getDIRACPlatform( platform )
            if result['OK']:
              self.queueDict[queueName]['ParametersDict']['Platform'] = result['Value'][0]

          ceQueueDict = dict( ceDict )
          ceQueueDict.update( self.queueDict[queueName]['ParametersDict'] )

          # Generate the CE object for the queue or pick the already existing one
          # if the queue definition did not change
          queueHash = self.__generateQueueHash( ceQueueDict )
          if queueName in self.queueCECache and self.queueCECache[queueName]['Hash'] == queueHash:
            queueCE = self.queueCECache[queueName]['CE']
          else:
            result = ceFactory.getCE( ceName = ce,
                                      ceType = ceDict['CEType'],
                                      ceParametersDict = ceQueueDict )
            if not result['OK']:
              return result
            self.queueCECache.setdefault( queueName, {} )
            self.queueCECache[queueName]['Hash'] = queueHash
            self.queueCECache[queueName]['CE'] = result['Value']
            queueCE = self.queueCECache[queueName]['CE']

          self.queueDict[queueName]['CE'] = queueCE
          self.queueDict[queueName]['CEName'] = ce
          self.queueDict[queueName]['CEType'] = ceDict['CEType']
          self.queueDict[queueName]['Site'] = site
          self.queueDict[queueName]['QueueName'] = queue
          self.queueDict[queueName]['Platform'] = platform
          result = self.queueDict[queueName]['CE'].isValid()
          if not result['OK']:
            self.log.fatal( result['Message'] )
            return result
          if 'BundleProxy' in self.queueDict[queueName]['ParametersDict']:
            if self.queueDict[queueName]['ParametersDict']['BundleProxy'].lower() in ['true','yes','1']:
              self.queueDict[queueName]['BundleProxy'] = True
          elif 'BundleProxy' in ceDict:
            if ceDict['BundleProxy'].lower() in ['true','yes','1']:
              self.queueDict[queueName]['BundleProxy'] = True

          if site not in self.sites:
            self.sites.append( site )

    return S_OK()
Ejemplo n.º 31
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary
    """

        queues = self.queueDict.keys()

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        tags = []
        for queue in queues:
            tags += self.queueDict[queue]['ParametersDict']['Tag']
        tqDict['Tag'] = list(set(tags))

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()
        self.log.info(tqIDList)
        result = pilotAgentsDB.countPilots(
            {
                'TaskQueueID': tqIDList,
                'Status': WAITING_PILOT_STATUS
            }, None)
        tagWaitingPilots = 0
        if result['OK']:
            tagWaitingPilots = result['Value']
        self.log.info(
            'Total %d jobs in %d task queues with %d waiting pilots' %
            (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
        self.log.info('Queues: ', self.queueDict.keys())
        # if tagWaitingPilots >= totalWaitingJobs:
        #  self.log.info( 'No more pilots to be submitted in this cycle' )
        #  return S_OK()

        result = self.siteClient.getUsableSites()
        if not result['OK']:
            return result
        siteMaskList = result['Value']

        queues = self.queueDict.keys()
        random.shuffle(queues)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for queue in queues:

            # Check if the queue failed previously
            failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
            if failedCount != 0:
                self.log.warn("%s queue failed recently, skipping %d cycles" %
                              (queue, 10 - failedCount))
                self.failedQueues[queue] += 1
                continue

            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            platform = self.queueDict[queue]['Platform']
            queueTags = self.queueDict[queue]['ParametersDict']['Tag']
            siteMask = siteName in siteMaskList
            processorTags = []

            # Check the status of the Site
            result = self.siteClient.getUsableSites(siteName)
            if not result['OK']:
                self.log.error("Can not get the status of site %s: %s" %
                               (siteName, result['Message']))
                continue
            if siteName not in result.get('Value', []):
                self.log.info("site %s is not active" % siteName)
                continue

            if self.rssFlag:
                # Check the status of the ComputingElement
                result = self.rssClient.getElementStatus(
                    ceName, "ComputingElement")
                if not result['OK']:
                    self.log.error(
                        "Can not get the status of computing element",
                        " %s: %s" % (siteName, result['Message']))
                    continue
                if result['Value']:
                    # get the value of the status
                    result = result['Value'][ceName]['all']

                if result not in ('Active', 'Degraded'):
                    self.log.verbose(
                        "Skipping computing element %s at %s: resource not usable"
                        % (ceName, siteName))
                    continue

            for tag in queueTags:
                if re.match(r'^[0-9]+Processors$', tag):
                    processorTags.append(tag)
            if 'WholeNode' in queueTags:
                processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (queueName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose(
                    "Skipping queue %s at site %s not in the mask" %
                    (queueName, siteName))
                continue

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            # if not siteMask and 'Site' in ceDict:
            #  self.log.info( 'Site not in the mask %s' % siteName )
            #  self.log.info( 'Removing "Site" from matching Dict' )
            #  del ceDict[ 'Site' ]
            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            ceDict['Tag'] = queueTags
            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % queue)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            totalTQJobsByProcessors = {}
            tqIDList = taskQueueDict.keys()
            tqIDListByProcessors = {}
            for tq in taskQueueDict:
                if 'Tags' not in taskQueueDict[tq]:
                    # skip non multiprocessor tqs
                    continue
                for tag in taskQueueDict[tq]['Tags']:
                    if tag in processorTags:
                        tqIDListByProcessors.setdefault(tag, [])
                        tqIDListByProcessors[tag].append(tq)

                        totalTQJobsByProcessors.setdefault(tag, 0)
                        totalTQJobsByProcessors[tag] += taskQueueDict[tq][
                            'Jobs']

                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), queue))

            queueSubmittedPilots = 0
            for tag in tqIDListByProcessors:

                self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" %
                                 (tag, tqIDListByProcessors[tag]))

                processors = 1

                m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
                if m:
                    processors = int(m.group('processors'))
                if tag == 'WholeNode':
                    processors = -1

                tagTQJobs = totalTQJobsByProcessors[tag]
                tagTqIDList = tqIDListByProcessors[tag]

                # Get the number of already waiting pilots for these task queues
                tagWaitingPilots = 0
                if self.pilotWaitingFlag:
                    lastUpdateTime = dateTime(
                    ) - self.pilotWaitingTime * second
                    result = pilotAgentsDB.countPilots(
                        {
                            'TaskQueueID': tagTqIDList,
                            'Status': WAITING_PILOT_STATUS
                        }, None, lastUpdateTime)
                    if not result['OK']:
                        self.log.error(
                            'Failed to get Number of Waiting pilots',
                            result['Message'])
                        tagWaitingPilots = 0
                    else:
                        tagWaitingPilots = result['Value']
                        self.log.verbose(
                            'Waiting Pilots for TaskQueue %s:' % tagTqIDList,
                            tagWaitingPilots)
                if tagWaitingPilots >= tagTQJobs:
                    self.log.verbose(
                        "%d waiting pilots already for all the available jobs"
                        % tagWaitingPilots)
                    continue

                self.log.verbose(
                    "%d waiting pilots for the total of %d eligible jobs for %s"
                    % (tagWaitingPilots, tagTQJobs, queue))

                # Get the working proxy
                cpuTime = queueCPUTime + 86400
                self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                                 (self.pilotDN, self.pilotGroup, cpuTime))
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, cpuTime)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, cpuTime - 60)

                # Get the number of available slots on the target site/queue
                totalSlots = self.getQueueSlots(queue, False)
                if totalSlots == 0:
                    self.log.debug('%s: No slots available' % queue)
                    continue

                # Note: comparing slots to job numbers is not accurate in multiprocessor case.
                #       This could lead to over submission.
                pilotsToSubmit = max(
                    0, min(totalSlots, tagTQJobs - tagWaitingPilots))
                self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                               ( queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit ) )

                # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
                pilotsToSubmit = min(
                    self.maxPilotsToSubmit - queueSubmittedPilots,
                    pilotsToSubmit)

                while pilotsToSubmit > 0:
                    self.log.info('Going to submit %d pilots to %s queue' %
                                  (pilotsToSubmit, queue))

                    bundleProxy = self.queueDict[queue].get(
                        'BundleProxy', False)
                    jobExecDir = ''
                    jobExecDir = self.queueDict[queue]['ParametersDict'].get(
                        'JobExecDir', jobExecDir)
                    httpProxy = self.queueDict[queue]['ParametersDict'].get(
                        'HttpProxy', '')

                    result = self.getExecutable(queue, pilotsToSubmit,
                                                bundleProxy, httpProxy,
                                                jobExecDir)
                    if not result['OK']:
                        return result

                    executable, pilotSubmissionChunk = result['Value']
                    result = ce.submitJob(executable,
                                          '',
                                          pilotSubmissionChunk,
                                          processors=processors)
                    # ## FIXME: The condor thing only transfers the file with some
                    # ## delay, so when we unlink here the script is gone
                    # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                    if ceType != 'HTCondorCE':
                        os.unlink(executable)
                    if not result['OK']:
                        self.log.error(
                            'Failed submission to queue %s:\n' % queue,
                            result['Message'])
                        pilotsToSubmit = 0
                        self.failedQueues[queue] += 1
                        continue

                    pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                    queueSubmittedPilots += pilotSubmissionChunk
                    # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                    # task queue priorities
                    pilotList = result['Value']
                    self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                    totalSubmittedPilots += len(pilotList)
                    self.log.info('Submitted %d pilots to %s@%s' %
                                  (len(pilotList), queueName, ceName))
                    stampDict = {}
                    if result.has_key('PilotStampDict'):
                        stampDict = result['PilotStampDict']
                    tqPriorityList = []
                    sumPriority = 0.
                    for tq in tagTqIDList:
                        sumPriority += taskQueueDict[tq]['Priority']
                        tqPriorityList.append((tq, sumPriority))
                    rndm = random.random() * sumPriority
                    tqDict = {}
                    for pilotID in pilotList:
                        rndm = random.random() * sumPriority
                        for tq, prio in tqPriorityList:
                            if rndm < prio:
                                tqID = tq
                                break
                        if not tqDict.has_key(tqID):
                            tqDict[tqID] = []
                        tqDict[tqID].append(pilotID)

                    for tqID, pilotList in tqDict.items():
                        result = pilotAgentsDB.addPilotTQReference(
                            pilotList, tqID, self.pilotDN, self.pilotGroup,
                            self.localhost, ceType, '', stampDict)
                        if not result['OK']:
                            self.log.error(
                                'Failed add pilots to the PilotAgentsDB: ',
                                result['Message'])
                            continue
                        for pilot in pilotList:
                            result = pilotAgentsDB.setPilotStatus(
                                pilot, 'Submitted', ceName,
                                'Successfully submitted by the SiteDirector',
                                siteName, queueName)
                            if not result['OK']:
                                self.log.error('Failed to set pilot status: ',
                                               result['Message'])
                                continue

        self.log.info(
            "%d pilots submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
Ejemplo n.º 32
0
  def beginExecution( self ):

    self.gridEnv = self.am_getOption( "GridEnv", getGridEnv() )
    # The SiteDirector is for a particular user community
    self.vo = self.am_getOption( "VO", '' )
    if not self.vo:
      self.vo = self.am_getOption( "Community", '' )
    if not self.vo:
      self.vo = CSGlobals.getVO()
    # The SiteDirector is for a particular user group
    self.group = self.am_getOption( "Group", '' )
    # self.voGroups contain all the eligible user groups for pilots submutted by this SiteDirector
    self.voGroups = []

    # Choose the group for which pilots will be submitted. This is a hack until
    # we will be able to match pilots to VOs.
    if not self.group:
      if self.vo:
        result = Registry.getGroupsForVO( self.vo )
        if not result['OK']:
          return result
        for group in result['Value']:
          if 'NormalUser' in Registry.getPropertiesForGroup( group ):
            self.voGroups.append( group )
    else:
      self.voGroups = [ self.group ]

    result = findGenericPilotCredentials( vo = self.vo )
    if not result[ 'OK' ]:
      return result
    self.pilotDN, self.pilotGroup = result[ 'Value' ]
    self.pilotDN = self.am_getOption( "PilotDN", self.pilotDN )
    self.pilotGroup = self.am_getOption( "PilotGroup", self.pilotGroup )

    self.platforms = []
    self.sites = []
    self.defaultSubmitPools = ''
    if self.group:
      self.defaultSubmitPools = Registry.getGroupOption( self.group, 'SubmitPools', '' )
    elif self.vo:
      self.defaultSubmitPools = Registry.getVOOption( self.vo, 'SubmitPools', '' )

    self.pilot = self.am_getOption( 'PilotScript', DIRAC_PILOT )
    self.install = DIRAC_INSTALL
    self.extraModules = self.am_getOption( 'ExtraPilotModules', [] ) + DIRAC_MODULES
    self.workingDirectory = self.am_getOption( 'WorkDirectory' )
    self.maxQueueLength = self.am_getOption( 'MaxQueueLength', 86400 * 3 )
    self.pilotLogLevel = self.am_getOption( 'PilotLogLevel', 'INFO' )
    self.maxJobsInFillMode = self.am_getOption( 'MaxJobsInFillMode', self.maxJobsInFillMode )
    self.maxPilotsToSubmit = self.am_getOption( 'MaxPilotsToSubmit', self.maxPilotsToSubmit )
    self.pilotWaitingFlag = self.am_getOption( 'PilotWaitingFlag', True )
    self.pilotWaitingTime = self.am_getOption( 'MaxPilotWaitingTime', 3600 )
    self.failedQueueCycleFactor = self.am_getOption( 'FailedQueueCycleFactor', 10 )
    self.pilotStatusUpdateCycleFactor = self.am_getOption( 'PilotStatusUpdateCycleFactor', 10 ) 

    # Flags
    self.updateStatus = self.am_getOption( 'UpdatePilotStatus', True )
    self.getOutput = self.am_getOption( 'GetPilotOutput', True )
    self.sendAccounting = self.am_getOption( 'SendPilotAccounting', True )

    # Get the site description dictionary
    siteNames = None
    if not self.am_getOption( 'Site', 'Any' ).lower() == "any":
      siteNames = self.am_getOption( 'Site', [] )
      if not siteNames:
        siteNames = None
    ceTypes = None
    if not self.am_getOption( 'CETypes', 'Any' ).lower() == "any":
      ceTypes = self.am_getOption( 'CETypes', [] )
    ces = None
    if not self.am_getOption( 'CEs', 'Any' ).lower() == "any":
      ces = self.am_getOption( 'CEs', [] )
      if not ces:
        ces = None
    result = Resources.getQueues( community = self.vo,
                                  siteList = siteNames,
                                  ceList = ces,
                                  ceTypeList = ceTypes,
                                  mode = 'Direct' )
    if not result['OK']:
      return result
    resourceDict = result['Value']
    result = self.getQueues( resourceDict )
    if not result['OK']:
      return result

    #if not siteNames:
    #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
    #  if siteName == 'Unknown':
    #    return S_OK( 'No site specified for the SiteDirector' )
    #  else:
    #    siteNames = [siteName]
    #self.siteNames = siteNames

    if self.updateStatus:
      self.log.always( 'Pilot status update requested' )
    if self.getOutput:
      self.log.always( 'Pilot output retrieval requested' )
    if self.sendAccounting:
      self.log.always( 'Pilot accounting sending requested' )

    self.log.always( 'Sites:', siteNames )
    self.log.always( 'CETypes:', ceTypes )
    self.log.always( 'CEs:', ces )
    self.log.always( 'PilotDN:', self.pilotDN )
    self.log.always( 'PilotGroup:', self.pilotGroup )
    self.log.always( 'MaxPilotsToSubmit:', self.maxPilotsToSubmit )
    self.log.always( 'MaxJobsInFillMode:', self.maxJobsInFillMode )

    self.localhost = socket.getfqdn()
    self.proxy = ''

    if self.firstPass:
      if self.queueDict:
        self.log.always( "Agent will serve queues:" )
        for queue in self.queueDict:
          self.log.always( "Site: %s, CE: %s, Queue: %s" % ( self.queueDict[queue]['Site'],
                                                           self.queueDict[queue]['CEName'],
                                                           queue ) )
    self.firstPass = False
    return S_OK()
Ejemplo n.º 33
0
  def doCommand( self ):
    """
#    Returns simple pilots efficiency
#
#    :attr:`args`:
#        - args[0]: string - should be a ValidElement
#
#        - args[1]: string - should be the name of the ValidElement
#
#    returns:
#      {
#        'Result': 'Good'|'Fair'|'Poor'|'Idle'|'Bad'
#      }
    """

    if not 'element' in self.args:
      return self.returnERROR( S_ERROR( 'element is missing' ) )
    element = self.args[ 'element' ]    
   
    if not 'siteName' in self.args:
      return self.returnERROR( S_ERROR( 'siteName is missing' ) )
    siteName = self.args[ 'siteName' ]  
    
    # If siteName is None, we take all sites
    if siteName is None:
      siteName = Resources.getSites()      
      if not siteName[ 'OK' ]:
        return self.returnERROR( siteName )
      siteName = siteName[ 'Value' ]

    if element == 'Site':
      results = self.wmsAdmin.getPilotSummaryWeb( { 'GridSite' : siteName }, [], 0, 300 )
    elif element == 'Resource':
      results = self.wmsAdmin.getPilotSummaryWeb( { 'ExpandSite' : siteName }, [], 0, 300 )      
    else:
      return self.returnERROR( S_ERROR( '%s is a wrong element' % element ) )  
       
    if not results[ 'OK' ]:
      return self.returnERROR( results )
    results = results[ 'Value' ]
    
    if not 'ParameterNames' in results:
      return self.returnERROR( S_ERROR( 'Malformed result dictionary' ) )
    params = results[ 'ParameterNames' ]
    
    if not 'Records' in results:
      return self.returnERROR( S_ERROR( 'Malformed result dictionary' ) )
    records = results[ 'Records' ]
    
    pilotResults = [] 
       
    for record in records:
      
      pilotDict = dict( zip( params , record ))
      try:
        pilotDict[ 'PilotsPerJob' ] = float( pilotDict[ 'PilotsPerJob' ] )
        pilotDict[ 'PilotsJobEff' ] = float( pilotDict[ 'PilotsJobEff' ] )
      except KeyError, e:
        return self.returnERROR( S_ERROR( e ) ) 
      except ValueError, e:
        return self.returnERROR( S_ERROR( e ) )