Example #1
  def execute( self, dataToResolve = None ):
    """This method is called to obtain the TURLs for all requested input data
       firstly by available site protocols and redundantly via TURL construction.
       If TURLs are missing these are conveyed in the result to
    """

    # Define local configuration options present at every site
    localSEList = self.configuration['LocalSEList']
    self.jobID = self.configuration.get( 'JobID' )
    allReplicas = self.configuration.get( 'AllReplicas', False )
    if allReplicas:
      self.log.info( 'All replicas will be used in the resolution' )

    if dataToResolve:
      self.log.verbose( 'Data to resolve passed directly to InputDataByProtocol module' )
      self.inputData = dataToResolve  # e.g. list supplied by another module

    self.inputData = [x.replace( 'LFN:', '' ) for x in self.inputData]
    self.log.verbose( 'InputData requirement to be resolved by protocol is:\n%s' % '\n'.join( self.inputData ) )

    # First make a check in case replicas have been removed or are not accessible
    # from the local site (remove these from consideration for local protocols)
    replicas = self.fileCatalogResult['Value']['Successful']
    self.log.debug( 'File Catalogue result is:\n%s' % str( replicas ) )

    # First get the preferred replica:
    result = self.__resolveReplicas( localSEList, replicas )
    if not result['OK']:
      return result
    success = result['Successful']
    if not allReplicas:
      bestReplica = {}
      for lfn in success:
        bestReplica[lfn] = success[lfn][0]
      ret = S_OK()
      ret.update( {'Successful': bestReplica, 'Failed':result['Failed']} )
      return ret

    # If all replicas are requested, get results for other SEs
    seList = set()
    localSESet = set( localSEList )
    for lfn in replicas.keys():
      extraSEs = set( replicas[lfn] ) - localSESet
      # If any extra SE, add it to the set, otherwise don't consider that file
      if extraSEs:
        seList.update( extraSEs )
      else:
        replicas.pop( lfn )
    seList -= self.metaKeys

    if seList:
      result = self.__resolveReplicas( seList, replicas, ignoreTape = True )
      if not result['OK']:
        return result
      for lfn in result['Successful']:
        success.setdefault( lfn, [] ).extend( result['Successful'][lfn] )
    # Only consider as failed the files that are not also successful
    failed = [lfn for lfn in result['Failed'] if lfn not in success]
    return S_OK( {'Successful': success, 'Failed':failed} )
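This and the later examples all rely on the DIRAC result convention: every call returns a dictionary whose 'OK' key selects between a 'Value' payload and a 'Message' error. A minimal stand-in (the real S_OK/S_ERROR helpers ship with DIRAC; this sketch is only an aid for reading the examples):

def S_OK(value=None):
    # success: callers test result['OK'], then read result['Value']
    return {'OK': True, 'Value': value}

def S_ERROR(message=''):
    # failure: callers read result['Message'] instead
    return {'OK': False, 'Message': message}

result = S_OK({'Successful': {'/lfn/a': ['SE1']}, 'Failed': []})
if result['OK']:
    print(result['Value']['Successful'])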
Example #2
  def available(self, jobIDList=None):
    """This method returns the number of available slots in the target CE. The CE
       instance polls for waiting and running jobs and compares to the limits
       in the CE parameters.

       :param jobIDList: list of already existing job IDs to be checked against
       :type jobIDList: python:list
    """

    # An empty jobIDList was passed: there are no registered jobs to check
    if jobIDList is not None and not jobIDList:
      result = S_OK()
      result['RunningJobs'] = 0
      result['WaitingJobs'] = 0
      result['SubmittedJobs'] = 0
    else:
      ceType = self.ceParameters.get('CEType')
      if ceType == 'CREAM':
        result = self.getCEStatus(jobIDList)
      else:
        result = self.getCEStatus()
      if not result['OK']:
        return result
    runningJobs = result['RunningJobs']
    waitingJobs = result['WaitingJobs']
    submittedJobs = result['SubmittedJobs']
    availableProcessors = result.get('AvailableProcessors')
    ceInfoDict = dict(result)

    maxTotalJobs = int(self.ceParameters.get('MaxTotalJobs', 0))
    ceInfoDict['MaxTotalJobs'] = maxTotalJobs
    waitingToRunningRatio = float(self.ceParameters.get('WaitingToRunningRatio', 0.0))
    # If there are no running jobs we can submit up to 'MaxWaitingJobs';
    # if there are running jobs, this grows to keep the W / R ratio at 'WaitingToRunningRatio'
    maxWaitingJobs = int(max(int(self.ceParameters.get('MaxWaitingJobs', 0)),
                             runningJobs * waitingToRunningRatio))

    self.log.verbose('Max Number of Jobs:', maxTotalJobs)
    self.log.verbose('Max W/R Ratio:', waitingToRunningRatio)
    self.log.verbose('Max Waiting Jobs:', maxWaitingJobs)

    # Determine how many more jobs can be submitted
    message = '%s CE: SubmittedJobs=%s' % (self.ceName, submittedJobs)
    message += ', WaitingJobs=%s, RunningJobs=%s' % (waitingJobs, runningJobs)
    totalJobs = runningJobs + waitingJobs

    message += ', MaxTotalJobs=%s' % (maxTotalJobs)

    if totalJobs >= maxTotalJobs:
      self.log.verbose('Max Number of Jobs reached:', maxTotalJobs)
      result['Value'] = 0
      message = 'There are %s waiting jobs and total jobs %s >= %s max total jobs' % (
          waitingJobs, totalJobs, maxTotalJobs)
    else:
      additionalJobs = 0
      if waitingJobs < maxWaitingJobs:
        additionalJobs = maxWaitingJobs - waitingJobs
        if totalJobs + additionalJobs >= maxTotalJobs:
          additionalJobs = maxTotalJobs - totalJobs
      # For SSH CE case
      if int(self.ceParameters.get('MaxWaitingJobs', 0)) == 0:
        additionalJobs = maxTotalJobs - runningJobs

      if availableProcessors is not None:
        additionalJobs = min(additionalJobs, availableProcessors)
      result['Value'] = additionalJobs

    result['Message'] = message
    result['CEInfoDict'] = ceInfoDict
    return result
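The slot arithmetic above is easy to get wrong, so here is the same logic as a self-contained pure function (a sketch; the function name and signature are mine, not part of the CE API):

def available_slots(running, waiting, maxTotal, maxWaiting, ratio,
                    availableProcessors=None):
    # the waiting cap grows with the running jobs: W <= max(MaxWaitingJobs, R * ratio)
    effectiveMaxWaiting = max(maxWaiting, int(running * ratio))
    total = running + waiting
    if total >= maxTotal:
        return 0
    additional = 0
    if waiting < effectiveMaxWaiting:
        additional = effectiveMaxWaiting - waiting
        if total + additional >= maxTotal:
            additional = maxTotal - total
    if maxWaiting == 0:  # the SSH CE special case above
        additional = maxTotal - running
    if availableProcessors is not None:
        additional = min(additional, availableProcessors)
    return additional

assert available_slots(running=10, waiting=2, maxTotal=50, maxWaiting=5, ratio=1.0) == 8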
Example #3
  def storeProxy( self, userDN, userGroup, chain ):
    """ Store user proxy into the Proxy repository for a user specified by his
        DN and group.
    """
    retVal = Registry.getUsernameForDN( userDN )
    if not retVal[ 'OK' ]:
      return retVal
    userName = retVal[ 'Value' ]
    #Get remaining secs
    retVal = chain.getRemainingSecs()
    if not retVal[ 'OK' ]:
      return retVal
    remainingSecs = retVal[ 'Value' ]
    if remainingSecs < self._minSecsToAllowStore:
      return S_ERROR( "Cannot store proxy, remaining secs %s is less than %s" % ( remainingSecs, self._minSecsToAllowStore ) )

    #Compare the DNs
    retVal = chain.getIssuerCert()
    if not retVal[ 'OK' ]:
      return retVal
    proxyIdentityDN = retVal[ 'Value' ].getSubjectDN()[ 'Value' ]
    if userDN != proxyIdentityDN:
      msg = "Mismatch in the user DN"
      vMsg = "Proxy says %s and credentials are %s" % ( proxyIdentityDN, userDN )
      self.log.error( msg, vMsg )
      return S_ERROR( "%s. %s" % ( msg, vMsg ) )
    #Check the groups
    retVal = chain.getDIRACGroup()
    if not retVal[ 'OK' ]:
      return retVal
    proxyGroup = retVal[ 'Value' ]
    if not proxyGroup:
      proxyGroup = Registry.getDefaultUserGroup()
    if userGroup != proxyGroup:
      msg = "Mismatch in the user group"
      vMsg = "Proxy says %s and credentials are %s" % ( proxyGroup, userGroup )
      self.log.error( msg, vMsg )
      return S_ERROR( "%s. %s" % ( msg, vMsg ) )
    #Check if its limited
    if chain.isLimitedProxy()['Value']:
      return S_ERROR( "Limited proxies are not allowed to be stored" )
    dLeft = remainingSecs / 86400
    hLeft = remainingSecs / 3600 - dLeft * 24
    mLeft = remainingSecs / 60 - hLeft * 60 - dLeft * 1440
    sLeft = remainingSecs - hLeft * 3600 - mLeft * 60 - dLeft * 86400
    self.log.info( "Storing proxy for credentials %s (%d:%02d:%02d:%02d left)" % ( proxyIdentityDN, dLeft, hLeft, mLeft, sLeft ) )

    try:
      sUserDN = self._escapeString( userDN )[ 'Value' ]
      sUserGroup = self._escapeString( userGroup )[ 'Value' ]
    except KeyError:
      return S_ERROR( "Cannot escape DN" )
    # Check what we have already got in the repository
    cmd = "SELECT TIMESTAMPDIFF( SECOND, UTC_TIMESTAMP(), ExpirationTime ), Pem FROM `ProxyDB_Proxies` WHERE UserDN=%s AND UserGroup=%s" % ( sUserDN, sUserGroup )
    result = self._query( cmd )
    if not result['OK']:
      return result
    # check if there is a previous proxy for the DN and group
    data = result[ 'Value' ]
    sqlInsert = True
    if len( data ) > 0:
      sqlInsert = False
      pem = data[0][1]
      if pem:
        remainingSecsInDB = data[0][0]
        if remainingSecs <= remainingSecsInDB:
          self.log.info( "Proxy stored is longer than uploaded, omitting.", "%s in uploaded, %s in db" % ( remainingSecs, remainingSecsInDB ) )
          return S_OK()

    pemChain = chain.dumpAllToString()['Value']
    dValues = { 'UserName' : self._escapeString( userName )[ 'Value' ],
                'UserDN' : sUserDN,
                'UserGroup' : sUserGroup,
                'Pem' : self._escapeString( pemChain )[ 'Value' ],
                'ExpirationTime' : 'TIMESTAMPADD( SECOND, %d, UTC_TIMESTAMP() )' % int( remainingSecs ),
                'PersistentFlag' : "'False'" }
    if sqlInsert:
      sqlFields = []
      sqlValues = []
      for key in dValues:
        sqlFields.append( key )
        sqlValues.append( dValues[ key ] )
      cmd = "INSERT INTO `ProxyDB_Proxies` ( %s ) VALUES ( %s )" % ( ", ".join( sqlFields ), ", ".join( sqlValues ) )
    else:
      sqlSet = []
      sqlWhere = []
      for k in dValues:
        if k in ( 'UserDN', 'UserGroup' ):
          sqlWhere.append( "%s = %s" % ( k, dValues[k] ) )
        else:
          sqlSet.append( "%s = %s" % ( k, dValues[k] ) )
      cmd = "UPDATE `ProxyDB_Proxies` SET %s WHERE %s" % ( ", ".join( sqlSet ), " AND ".join( sqlWhere ) )

    self.logAction( "store proxy", userDN, userGroup, userDN, userGroup )
    return self._update( cmd )
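The day/hour/minute/second breakdown above depends on Python 2 integer division; a divmod-based sketch produces the same numbers and also survives Python 3:

def break_down_secs(remainingSecs):
    days, rest = divmod(int(remainingSecs), 86400)
    hours, rest = divmod(rest, 3600)
    minutes, seconds = divmod(rest, 60)
    return days, hours, minutes, seconds

assert break_down_secs(90061) == (1, 1, 1, 1)  # 1 day, 1 h, 1 min, 1 s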
Example #4
    def __uploadInputSandbox(self, classAdJob, jobDescriptionObject=None):
        """Checks the validity of the job Input Sandbox.
       The function returns the list of Input Sandbox files.
       The total volume of the input sandbox is evaluated
    """
        inputSandbox = self.__getInputSandboxEntries(classAdJob)

        realFiles = []
        badFiles = []
        diskFiles = []

        for isFile in inputSandbox:
            if not isFile.startswith(('lfn:', 'LFN:', 'SB:', '%s', '%(')):
                realFiles.append(isFile)

        stringIOFiles = []
        stringIOFilesSize = 0
        if jobDescriptionObject is not None:
            if isinstance(jobDescriptionObject, StringIO):
                stringIOFiles = [jobDescriptionObject]
                stringIOFilesSize = len(jobDescriptionObject.getvalue())
                gLogger.debug("Size of the stringIOFiles: " +
                              str(stringIOFilesSize))
            else:
                return S_ERROR(
                    EWMSJDL, "jobDescriptionObject is not a StringIO object")

        # Check real files
        for isFile in realFiles:
            # we are passing in real files, we expect them to be on disk
            if not os.path.exists(isFile):
                badFiles.append(isFile)
                gLogger.warn("inputSandbox file/directory " + isFile +
                             " not found. Keep looking for the others")
                continue
            diskFiles.append(isFile)

        diskFilesSize = File.getGlobbedTotalSize(diskFiles)
        gLogger.debug("Size of the diskFiles: " + str(diskFilesSize))
        totalSize = diskFilesSize + stringIOFilesSize
        gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))

        okFiles = stringIOFiles + diskFiles
        if badFiles:
            result = S_ERROR(EWMSJDL, 'Input Sandbox is not valid')
            result['BadFile'] = badFiles
            result['TotalSize'] = totalSize
            return result

        if okFiles:
            if not self.sandboxClient:
                self.sandboxClient = SandboxStoreClient(
                    useCertificates=self.useCertificates,
                    delegatedDN=self.delegatedDN,
                    delegatedGroup=self.delegatedGroup)
            result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
            if not result['OK']:
                return result
            inputSandbox.append(result['Value'])
            classAdJob.insertAttributeVectorString("InputSandbox",
                                                   inputSandbox)

        return S_OK()
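File.getGlobbedTotalSize is a DIRAC utility; a rough standard-library stand-in (assuming plain files and directories, no pattern globbing) would be:

import os

def globbed_total_size(paths):
    total = 0
    for path in paths:
        if os.path.isfile(path):
            total += os.path.getsize(path)
        elif os.path.isdir(path):
            # directories count as the sum of everything below them
            for root, _dirs, files in os.walk(path):
                for name in files:
                    total += os.path.getsize(os.path.join(root, name))
    return total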
Example #5
def getQueues(siteList=None,
              ceList=None,
              ceTypeList=None,
              community=None,
              mode=None):
    """ Get CE/queue options according to the specified selection
  """

    result = gConfig.getSections('/Resources/Sites')
    if not result['OK']:
        return result

    resultDict = {}

    grids = result['Value']
    for grid in grids:
        result = gConfig.getSections('/Resources/Sites/%s' % grid)
        if not result['OK']:
            continue
        sites = result['Value']
        for site in sites:
            if siteList is not None and site not in siteList:
                continue
            if community:
                comList = gConfig.getValue(
                    '/Resources/Sites/%s/%s/VO' % (grid, site), [])
                if comList and community not in comList:
                    continue
            siteCEParameters = {}
            result = gConfig.getOptionsDict('/Resources/Sites/%s/%s/CEs' %
                                            (grid, site))
            if result['OK']:
                siteCEParameters = result['Value']
            result = gConfig.getSections('/Resources/Sites/%s/%s/CEs' %
                                         (grid, site))
            if not result['OK']:
                continue
            ces = result['Value']
            for ce in ces:
                if mode:
                    ceMode = gConfig.getValue(
                        '/Resources/Sites/%s/%s/CEs/%s/SubmissionMode' %
                        (grid, site, ce), 'Direct')
                    if not ceMode or ceMode != mode:
                        continue
                if ceTypeList:
                    ceType = gConfig.getValue(
                        '/Resources/Sites/%s/%s/CEs/%s/CEType' %
                        (grid, site, ce), '')
                    if not ceType or ceType not in ceTypeList:
                        continue
                if ceList is not None and ce not in ceList:
                    continue
                if community:
                    comList = gConfig.getValue(
                        '/Resources/Sites/%s/%s/CEs/%s/VO' % (grid, site, ce),
                        [])
                    if comList and community not in comList:
                        continue
                ceOptionsDict = dict(siteCEParameters)
                result = gConfig.getOptionsDict(
                    '/Resources/Sites/%s/%s/CEs/%s' % (grid, site, ce))
                if not result['OK']:
                    continue
                ceOptionsDict.update(result['Value'])
                result = gConfig.getSections(
                    '/Resources/Sites/%s/%s/CEs/%s/Queues' % (grid, site, ce))
                if not result['OK']:
                    continue
                queues = result['Value']
                for queue in queues:
                    if community:
                        comList = gConfig.getValue(
                            '/Resources/Sites/%s/%s/CEs/%s/Queues/%s/VO' %
                            (grid, site, ce, queue), [])
                        if comList and community not in comList:
                            continue
                    resultDict.setdefault(site, {})
                    resultDict[site].setdefault(ce, ceOptionsDict)
                    resultDict[site][ce].setdefault('Queues', {})
                    result = gConfig.getOptionsDict(
                        '/Resources/Sites/%s/%s/CEs/%s/Queues/%s' %
                        (grid, site, ce, queue))
                    if not result['OK']:
                        continue
                    queueOptionsDict = result['Value']
                    resultDict[site][ce]['Queues'][queue] = queueOptionsDict

    return S_OK(resultDict)
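A hypothetical traversal of the nested dictionary that getQueues() returns (the site name below is invented for illustration):

result = getQueues(siteList=['LCG.Example.org'], mode='Direct')
if result['OK']:
    for site, ces in result['Value'].items():
        for ce, ceOptions in ces.items():
            for queue, queueOptions in ceOptions['Queues'].items():
                print('%s / %s / %s -> %s' % (site, ce, queue, queueOptions))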
Example #6
    def physicalRemoval(self, index, requestObj, subRequestAttrs,
                        subRequestFiles):
        """ action for 'physicalRemoval' operation

    :param self: self reference
    :param int index: subRequest index in execution order
    :param RequestContainer requestObj: request
    :param dict subRequestAttrs: subRequest's attributes
    :param dict subRequestFiles: subRequest's files
    """
        self.info("physicalRemoval: processing subrequest %s" % index)
        if requestObj.isSubRequestEmpty(index, "removal")["Value"]:
            self.info(
                "physicalRemoval: subrequest %s is empty, setting its status to 'Done'"
                % index)
            requestObj.setSubRequestStatus(index, "removal", "Done")
            return S_OK(requestObj)

        targetSEs = list(
            set([
                targetSE.strip()
                for targetSE in subRequestAttrs["TargetSE"].split(",")
                if targetSE.strip()
            ]))
        pfns = []
        pfnToLfn = {}
        for subRequestFile in subRequestFiles:
            if subRequestFile["Status"] == "Waiting":
                pfn = subRequestFile["PFN"]
                lfn = subRequestFile["LFN"]
                pfnToLfn[pfn] = lfn
                pfns.append(pfn)
        failed = {}
        errors = {}
        self.addMark('PhysicalRemovalAtt', len(pfns))
        for targetSE in targetSEs:
            remove = self.replicaManager().removeStorageFile(pfns, targetSE)
            if remove["OK"]:
                for pfn in remove["Value"]["Failed"]:
                    if pfn not in failed:
                        failed[pfn] = {}
                    failed[pfn][targetSE] = remove["Value"]["Failed"][pfn]
            else:
                errors[targetSE] = remove["Message"]
                for pfn in pfns:
                    if pfn not in failed:
                        failed[pfn] = {}
                    failed[pfn][targetSE] = "Completely"
        failedPFNs = failed.keys()
        pfnsOK = [pfn for pfn in pfns if pfn not in failedPFNs]
        self.addMark("PhysicalRemovalDone", len(pfnsOK))
        for pfn in pfnsOK:
            self.info("physicalRemoval: succesfully removed %s from %s" %
                      (pfn, str(targetSEs)))
            res = requestObj.setSubRequestFileAttributeValue(
                index, "removal", pfnToLfn[pfn], "Status", "Done")
            if not res["OK"]:
                self.error(
                    "physicalRemoval: error setting status to 'Done' for %s" %
                    pfnToLfn[pfn])

        if failed:
            self.addMark("PhysicalRemovalFail", len(failedPFNs))
            for pfn in failed:
                for targetSE in failed[pfn]:
                    if type(failed[pfn][targetSE]) in StringTypes:
                        if re.search("no such file or directory",
                                     failed[pfn][targetSE].lower()):
                            self.info(
                                "physicalRemoval: file %s did not exist" % pfn)
                            res = requestObj.setSubRequestFileAttributeValue(
                                index, "removal", pfnToLfn[pfn], "Status",
                                "Done")
                            if not res["OK"]:
                                self.error(
                                    "physicalRemoval: error setting status to 'Done' for %s"
                                    % pfnToLfn[pfn])

        if errors:
            for targetSE in errors:
                self.warn(
                    "physicalRemoval: completely failed to remove files at %s"
                    % targetSE)

        # # subrequest empty or all Files done?
        if requestObj.isSubRequestDone(index, "removal")["Value"]:
            self.info(
                "physicalRemoval: all files processed, setting subrequest status to 'Done'"
            )
            requestObj.setSubRequestStatus(index, "removal", "Done")

        return S_OK(requestObj)
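The TargetSE parsing (split on commas, strip whitespace, deduplicate) recurs in several of these handlers; isolated, it is just:

def parse_target_ses(targetSEAttr):
    # "SE1, SE2,,SE1" -> ['SE1', 'SE2'] (set order, as in the originals)
    return list({se.strip() for se in targetSEAttr.split(',') if se.strip()})

assert sorted(parse_target_ses('SE1, SE2,,SE1')) == ['SE1', 'SE2']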
Example #7
    def replicaRemoval(self, index, requestObj, subRequestAttrs,
                       subRequestFiles):
        """ action for 'replicaRemoval' operation

    :param self: self reference
    :param int index: subRequest index in execution order
    :param RequestContainer requestObj: request
    :param dict subRequestAttrs: subRequest's attributes
    :param dict subRequestFiles: subRequest's files

    TODO: add bulk removal first
    """
        self.info("replicaRemoval: processing subrequest %s" % index)
        if requestObj.isSubRequestEmpty(index, "removal")["Value"]:
            self.info(
                "replicaRemoval: subrequest %s is empty, setting its status to 'Done'"
                % index)
            requestObj.setSubRequestStatus(index, "removal", "Done")
            return S_OK(requestObj)

        targetSEs = list(
            set([
                targetSE.strip()
                for targetSE in subRequestAttrs["TargetSE"].split(",")
                if targetSE.strip()
            ]))
        lfns = [
            str(subRequestFile["LFN"]) for subRequestFile in subRequestFiles
            if subRequestFile["Status"] == "Waiting"
            and str(subRequestFile["LFN"])
        ]

        self.debug(
            "replicaRemoval: found %s lfns to delete from %s sites (%s replicas)"
            % (len(lfns), len(targetSEs), len(lfns) * len(targetSEs)))
        self.addMark("ReplicaRemovalAtt", len(lfns) * len(targetSEs))
        removalStatus = {}

        # # loop over LFNs
        for lfn in lfns:
            self.info("replicaRemoval: processing file %s" % lfn)
            # # prepare status dict
            removalStatus[lfn] = dict.fromkeys(targetSEs, "")
            # # loop over targetSEs
            try:
                for targetSE in targetSEs:
                    # # try to remove using current proxy
                    removeReplica = self.replicaManager().removeReplica(
                        targetSE, lfn)
                    # # file does not exist?
                    if not removeReplica["OK"] and \
                       "no such file or directory" in str(removeReplica["Message"]).lower():
                        removalStatus[lfn][targetSE] = removeReplica["Message"]
                        continue

                    # # not OK but request belongs to DataManager?
                    if not self.requestOwnerDN and \
                          ( not removeReplica["OK"] and
                            "Write access not permitted for this credential." in removeReplica["Message"] ) or \
                          ( removeReplica["OK"] and "Failed" in removeReplica["Value"] and
                            lfn in removeReplica["Value"]["Failed"] and
                            "permission denied" in str( removeReplica["Value"]["Failed"][lfn] ).lower() ):
                        # # get proxy for LFN
                        getProxyForLFN = self.getProxyForLFN(lfn)
                        # # can't get correct proxy?
                        if not getProxyForLFN["OK"]:
                            self.warn(
                                "replicaRemoval: unable to get proxy for file %s: %s"
                                % (lfn, getProxyForLFN["Message"]))
                            removeReplica = getProxyForLFN
                        else:
                            # # got correct proxy? try to remove again
                            removeReplica = self.replicaManager().removeReplica(targetSE, lfn)

                    if not removeReplica["OK"]:
                        removalStatus[lfn][targetSE] = removeReplica["Message"]
                        continue
                    removeReplica = removeReplica["Value"]
                    # # check failed status for missing files
                    if lfn in removeReplica["Failed"]:
                        removalStatus[lfn][targetSE] = removeReplica["Failed"][
                            lfn]
            finally:
                # # make sure DataManager proxy is set back in place
                if not self.requestOwnerDN and self.dataManagerProxy():
                    # # remove temp proxy
                    if os.environ["X509_USER_PROXY"] != self.dataManagerProxy(
                    ):
                        os.unlink(os.environ["X509_USER_PROXY"])
                    # # put back DataManager proxy
                    os.environ["X509_USER_PROXY"] = self.dataManagerProxy()

        replicasRemoved = 0
        replicasFailed = 0
        subRequestError = []

        # # filter out missing files
        for lfn, pTargetSEs in removalStatus.items():
            for targetSE, error in pTargetSEs.items():
                if "no such file or directory" in str(error).lower():
                    removalStatus[lfn][targetSE] = ""
        # # loop over statuses and errors
        for lfn, pTargetSEs in removalStatus.items():

            failed = [(targetSE, error)
                      for targetSE, error in pTargetSEs.items() if error != ""]
            successful = [(targetSE, error)
                          for targetSE, error in pTargetSEs.items()
                          if error == ""]

            replicasRemoved += len(successful)
            replicasFailed += len(failed)

            if not failed:
                self.info("replicaRemoval: successfully removed %s from %s" %
                          (lfn, str(targetSEs)))
                updateStatus = requestObj.setSubRequestFileAttributeValue(
                    index, "removal", lfn, "Status", "Done")
                if not updateStatus["OK"]:
                    self.error(
                        "replicaRemoval: error setting status to 'Done' for %s"
                        % lfn)
                continue

            for targetSE, error in failed:
                self.warn("replicaRemoval: failed to remove %s from %s: %s" %
                          (lfn, targetSE, error))

            fileError = ";".join([
                "%s:%s" % (targetSE, str(error).replace("'", ""))
                for targetSE, error in failed
            ])[:255]
            subRequestError.append(fileError)
            fileError = requestObj.setSubRequestFileAttributeValue(
                index, "removal", lfn, "Error", fileError)
            if not fileError["OK"]:
                self.error("replicaRemoval: unable to set Error for %s: %s" %
                           (lfn, fileError["Message"]))

        self.addMark("ReplicaRemovalDone", replicasRemoved)
        self.addMark("ReplicaRemovalFail", replicasFailed)

        # # no 'Waiting' files or all 'Done'
        if requestObj.isSubRequestDone(index, "removal")["Value"]:
            self.info(
                "replicaRemoval: all files processed, setting subrequest status to 'Done'"
            )
            requestObj.setSubRequestStatus(index, "removal", "Done")
        elif replicasFailed:
            self.info(
                "replicaRemoval: all files processed, failed to remove %s replicas"
                % replicasFailed)
            subRequestError = ";".join(subRequestError).replace("'", "")[:255]
            subRequestError = requestObj.setSubRequestAttributeValue(
                index, "removal", "Error", subRequestError)

        # # return requestObj at least
        return S_OK(requestObj)
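The try/finally blocks that swap X509_USER_PROXY in and out are worth isolating. A hypothetical helper (not part of DIRAC) that runs a callable under a temporary proxy and always restores the environment:

import os

def with_temp_proxy(proxyPath, operation):
    savedProxy = os.environ.get('X509_USER_PROXY')
    os.environ['X509_USER_PROXY'] = proxyPath
    try:
        return operation()
    finally:
        # restore the original proxy even if the operation raises
        if savedProxy is not None:
            os.environ['X509_USER_PROXY'] = savedProxy
        else:
            os.environ.pop('X509_USER_PROXY', None)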
Example #8
    def execute(self):
        """ The main agent execution method
    """
        limitDate = date() - self._period
        limitDateString = toString(limitDate)
        tableList = [
            "MessageRepository", "FixedTextMessages", "Systems", "SubSystems"
        ]
        columnsList = [
            "SystemName", "SubSystemName", "count(*) as entries",
            "FixedTextString"
        ]
        cmd = "SELECT " + ', '.join( columnsList ) + " FROM " \
              + " NATURAL JOIN ".join( tableList ) \
              + " WHERE MessageTime > '%s'" % limitDate \
              + " GROUP BY FixedTextString HAVING entries > %s" % self._threshold \
              + " ORDER BY entries DESC LIMIT %i;" % self._limit

        result = self.SystemLoggingDB._query(cmd)
        if not result['OK']:
            return result

        messageList = result['Value']

        if messageList == 'None' or messageList == ():
            self.log.warn('The DB query returned an empty result')
            return S_OK()

        mailBody = '\n'
        for message in messageList:
            mailBody = mailBody + "Count: "+str(message[2])+"\tError: '"\
                       + message[3] + "'\tSystem: '" + message[0]\
                       + "'\tSubsystem: '" + message[1] + "'\n"

        mailBody = mailBody + "\n\n-------------------------------------------------------\n"\
                   + "Please do not reply to this mail. It was automatically\n"\
                   + "generated by a Dirac Agent.\n"

        result = self.SystemLoggingDB._getDataFromAgentTable(self.agentName)
        self.log.debug(result)
        if not result['OK']:
            errorString = "Could not get the date when the last mail was sent"
            self.log.error(errorString)
            return S_ERROR(errorString)
        else:
            if len(result['Value']):
                self.log.debug("date value: %s" %
                               fromString(result['Value'][0][0][1:-1]))
                lastMailSentDate = fromString(result['Value'][0][0][1:-1])
            else:
                lastMailSentDate = limitDate - 1 * day
                result = self.SystemLoggingDB._insertDataIntoAgentTable(
                    self.agentName, lastMailSentDate)
                if not result['OK']:
                    errorString = "Could not insert data into the DB"
                    self.log.error(errorString, result['Message'])
                    return S_ERROR(errorString + ": " + result['Message'])

        self.log.debug("limitDate: %s\t" % limitDate \
                       + "lastMailSentDate: %s\n" % lastMailSentDate )
        if lastMailSentDate > limitDate:
            self.log.info( "The previous report was sent less "\
                           +" than %s days ago" % self.__days )
            return S_OK()

        dateSent = toString(date())
        self.log.info("The list with the top errors has been sent")

        result = self.SystemLoggingDB._insertDataIntoAgentTable(
            self.agentName, dateSent)
        if not result['OK']:
            errorString = "Could not insert data into the DB"
            self.log.error(errorString, result['Message'])
            return S_ERROR(errorString + ": " + result['Message'])

        result = self.notification.sendMail(self._mailAddress, self._subject,
                                            mailBody)
        if not result['OK']:
            self.log.warn("The notification could not be sent")
            return S_OK()

        return S_OK("The list with the top errors has been sent")
Example #9
 def put(self, message, parameters=None):
     return S_OK("FakeMQConnection sending message: " + str(message))
Example #10
 def disconnect(self):
     return S_OK("FakeMQConnection disconnecting")
 def initializeHandler(cls, serviceInfo):
     """ Handler initialization
 """
     cls.upDB = UserProfileDB()
     return S_OK()
Example #12
 def _userjobmodules(self, stepdefinition):
     res1 = self._setApplicationModuleAndParameters(stepdefinition)
     res2 = self._setUserJobFinalization(stepdefinition)
     if not res1["OK"] or not res2["OK"]:
         return S_ERROR('userjobmodules failed')
     return S_OK()
Example #13
 def _setStepParametersValues(self, instance):
     self._setBaseStepParametersValues(instance)
     for depn, depv in self.dependencies.items():
         self._job._addSoftware(depn, depv)
     return S_OK()
Example #14
 def _addParametersToStep(self, stepdefinition):
     res = self._addBaseParameters(stepdefinition)
     if not res["OK"]:
         return S_ERROR("Failed to set base parameters")
     return S_OK()
Example #15
 def _prodjobmodules(self, stepdefinition):
     res1 = self._setApplicationModuleAndParameters(stepdefinition)
     res2 = self._setOutputComputeDataList(stepdefinition)
     if not res1["OK"] or not res2["OK"]:
         return S_ERROR('prodjobmodules failed')
     return S_OK()
Example #16
  def __resolveReplicas( self, seList, replicas, ignoreTape = False ):
    diskSEs = set()
    tapeSEs = set()
    if not seList:
      ret = S_OK()
      ret.update( {'Successful': [], 'Failed': []} )
      return ret

    for localSE in seList:
      seStatus = self.__storageElement( localSE ).getStatus()['Value']
      if seStatus['Read'] and seStatus['DiskSE']:
        diskSEs.add( localSE )
      elif seStatus['Read'] and seStatus['TapeSE']:
        tapeSEs.add( localSE )

    # For the unlikely case that a file is found on two SEs at the same site
    # disk-based replicas are favoured.
    # Problematic files will be returned and can be handled by another module
    failedReplicas = set()
    newReplicasDict = {}
    for lfn, reps in replicas.items():
      if lfn in self.inputData:
        # Check that all replicas are on a valid local SE
        if not [se for se in reps if se in diskSEs.union( tapeSEs )]:
          failedReplicas.add( lfn )
        else:
          for seName in diskSEs & set( reps ):
            newReplicasDict.setdefault( lfn, [] ).append( seName )
          if not newReplicasDict.get( lfn ) and not ignoreTape:
            for seName in tapeSEs & set( reps ):
              newReplicasDict.setdefault( lfn, [] ).append( seName )

    # Check that all LFNs have at least one replica and GUID
    if failedReplicas:
      # in principle this is not a failure but depends on the policy of the VO
      # datasets could be downloaded from another site
      self.log.info( 'The following file(s) were found not to have replicas on any of %s:\n%s' % ( str( seList ), '\n'.join( sorted( failedReplicas ) ) ) )

    # Need to group files by SE in order to stage optimally
    # we know from above that all remaining files have a replica
    # (preferring disk if >1) in the local storage.
    # IMPORTANT, only add replicas for input data that is requested
    # since this module could have been executed after another.
    seFilesDict = {}
    for lfn, seNames in newReplicasDict.items():
      for seName in seNames:
        seFilesDict.setdefault( seName, [] ).append( lfn )

    sortedSEs = sorted( [ ( len( lfns ), seName ) for seName, lfns in seFilesDict.items() ], reverse = True )

    trackLFNs = {}
    for _len, seName in sortedSEs:
      for lfn in seFilesDict[seName]:
        if 'Size' in replicas[lfn] and 'GUID' in replicas[lfn]:
          trackLFNs.setdefault( lfn, [] ).append( { 'pfn': replicas.get( lfn, {} ).get( seName, lfn ), 'se': seName, 'size': replicas[lfn]['Size'], 'guid': replicas[lfn]['GUID'] } )

    self.log.debug( 'Files grouped by SEs are:\n%s' % str( seFilesDict ) )
    for seName, lfns in seFilesDict.items():
      self.log.info( ' %s LFNs found from catalog at SE %s' % ( len( lfns ), seName ) )
      self.log.verbose( '\n'.join( lfns ) )

    # Can now start to obtain TURLs for files grouped by localSE
    # for requested input data
    requestedProtocol = self.configuration.get( 'Protocol', '' )
    for seName, lfns in seFilesDict.items():
      if not lfns:
        continue
      failedReps = set()
      result = self.__storageElement( seName ).getFileMetadata( lfns )
      if not result['OK']:
        self.log.error( "Error getting metadata.", result['Message'] + ':\n%s' % '\n'.join( lfns ) )
        # If we can not get MetaData, most likely there is a problem with the SE
        # declare the replicas failed and continue
        failedReps.update( lfns )
        continue
      failed = result['Value']['Failed']
      if failed:
        # If MetaData can not be retrieved for some PFNs
        # declare them failed and go on
        for lfn in failed:
          lfns.remove( lfn )
          if isinstance( failed, dict ):
            self.log.error( failed[ lfn ], lfn )
          failedReps.add( lfn )
      for lfn, metadata in result['Value']['Successful'].items():
        if metadata['Lost']:
          error = "File has been Lost by the StorageElement %s" % seName
        elif metadata['Unavailable']:
          error = "File is declared Unavailable by the StorageElement %s" % seName
        elif seName in tapeSEs and not metadata['Cached']:
          error = "File is no longer in StorageElement %s Cache" % seName
        else:
          error = ''
        if error:
          lfns.remove( lfn )
          self.log.error( error, lfn )
          # If the PFN is not available
          # declare it failed and go on
          failedReps.add( lfn )

      if None in failedReps:
        failedReps.remove( None )
      if not failedReps:
        self.log.info( 'Preliminary checks OK, getting TURLS at %s for:\n%s' % ( seName, '\n'.join( lfns ) ) )
      else:
        self.log.warn( "Errors during preliminary checks for %d files" % len( failedReps ) )

      result = self.__storageElement( seName ).getAccessUrl( lfns, protocol = requestedProtocol )
      if not result['OK']:
        self.log.error( "Error getting TURLs", result['Message'] )
        return result

      badTURLCount = 0
      badTURLs = []
      seResult = result['Value']

      for lfn, cause in seResult['Failed'].items():
        badTURLCount += 1
        badTURLs.append( 'Failed to obtain TURL for %s: %s' % ( lfn, cause ) )
        failedReps.add( lfn )

      if badTURLCount:
        self.log.warn( 'Found %s problematic TURL(s) for job %s' % ( badTURLCount, self.jobID ) )
        param = '\n'.join( badTURLs )
        self.log.info( param )
        result = self.__setJobParam( 'ProblematicTURLs', param )
        if not result['OK']:
          self.log.warn( "Error setting job param", result['Message'] )

      failedReplicas.update( failedReps )
      for lfn, turl in seResult['Successful'].items():
        for track in trackLFNs[lfn]:
          if track['se'] == seName:
            track['turl'] = turl
            break
        self.log.info( 'Resolved input data\n>>>> SE: %s\n>>>> LFN: %s\n>>>> TURL: %s' %
                       ( seName, lfn, turl ) )
      ##### End of loop on SE #######

    # Check if the files were actually resolved (i.e. have a TURL)
    # If so, remove them from failed list
    for lfn, mdataList in trackLFNs.items():
      for mdata in list( mdataList ):
        if 'turl' not in mdata:
          mdataList.remove( mdata )
          self.log.info( 'No TURL resolved for %s at %s' % ( lfn, mdata['se'] ) )
      if not mdataList:
        trackLFNs.pop( lfn, None )
        failedReplicas.add( lfn )
      elif lfn in failedReplicas:
        failedReplicas.remove( lfn )
    self.log.debug( 'All resolved data', sorted( trackLFNs ) )
    self.log.debug( 'All failed data', sorted( failedReplicas ) )

    ret = S_OK()
    ret.update( {'Successful': trackLFNs, 'Failed': sorted( failedReplicas )} )
    return ret
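The inversion from {lfn: [se, ...]} to {se: [lfn, ...]} that groups files for staging is worth seeing on its own:

def group_by_se(replicasDict):
    seFiles = {}
    for lfn, seNames in replicasDict.items():
        for seName in seNames:
            seFiles.setdefault(seName, []).append(lfn)
    return seFiles

grouped = group_by_se({'/lfn/a': ['SE1', 'SE2'], '/lfn/b': ['SE1']})
# -> {'SE1': ['/lfn/a', '/lfn/b'], 'SE2': ['/lfn/a']} (list order follows dict iteration)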
Example #17
def initSEs():
    '''
    Initializes SE statuses, taking their values from the CS.
    '''
    from DIRAC.ResourceStatusSystem.Client import ResourceStatusClient
    from DIRAC.ResourceStatusSystem.PolicySystem import StateMachine
    from DIRAC.ResourceStatusSystem.Utilities import CSHelpers, RssConfiguration

    #WarmUp local copy
    CSHelpers.warmUp()

    subLogger.info('Initializing SEs')

    rssClient = ResourceStatusClient.ResourceStatusClient()

    ses = CSHelpers.getStorageElements()
    if not ses['OK']:
        return ses
    ses = ses['Value']

    statuses = StateMachine.RSSMachine(None).getStates()
    statusTypes = RssConfiguration.RssConfiguration().getConfigStatusType(
        'StorageElement')
    reason = 'dirac-rss-sync'

    subLogger.debug(statuses)
    subLogger.debug(statusTypes)

    for se in ses:

        subLogger.debug(se)

        opts = gConfig.getOptionsDict('/Resources/StorageElements/%s' % se)
        if not opts['OK']:
            subLogger.warn(opts['Message'])
            continue
        opts = opts['Value']

        subLogger.debug(opts)

        # We copy the list into a new object to remove items INSIDE the loop !
        statusTypesList = statusTypes[:]

        for statusType, status in opts.iteritems():

            #Sanity check...
            if statusType not in statusTypesList:
                continue

            #Transforms statuses to RSS terms
            if status in ('NotAllowed', 'InActive'):
                status = 'Banned'

            if status not in statuses:
                subLogger.error('%s not a valid status for %s - %s' %
                                (status, se, statusType))
                continue

            # Remove it from the backtracking list
            statusTypesList.remove(statusType)

            subLogger.debug([se, statusType, status, reason])
            result = rssClient.addOrModifyStatusElement(
                'Resource',
                'Status',
                name=se,
                statusType=statusType,
                status=status,
                elementType='StorageElement',
                reason=reason)

            if not result['OK']:
                subLogger.error('Failed to modify')
                subLogger.error(result['Message'])
                continue

        #Backtracking: statusTypes not present on CS
        for statusType in statusTypesList:

            result = rssClient.addOrModifyStatusElement(
                'Resource',
                'Status',
                name=se,
                statusType=statusType,
                status=DEFAULT_STATUS,
                elementType='StorageElement',
                reason=reason)
            if not result['OK']:
                subLogger.error('Error in backtracking for %s,%s,%s' %
                                (se, statusType, DEFAULT_STATUS))
                subLogger.error(result['Message'])

    return S_OK()
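The CS-to-RSS status translation inside the loop reduces to a small pure function (a sketch; the status names are the ones used above):

def to_rss_status(csStatus):
    # 'NotAllowed' and 'InActive' collapse to the RSS term 'Banned'
    return 'Banned' if csStatus in ('NotAllowed', 'InActive') else csStatus

assert to_rss_status('InActive') == 'Banned'
assert to_rss_status('Active') == 'Active'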
Example #18
 def initializeOptimizer(cls):
     """Initialize specific parameters for JobSanityAgent."""
     cls.sandboxClient = SandboxStoreClient(useCertificates=True, smdb=True)
     return S_OK()
Example #19
    def removeFile(self, index, requestObj, subRequestAttrs, subRequestFiles):
        """ action for 'removeFile' operation

    :param self: self reference
    :param int index: subRequest index in execution order
    :param RequestContainer requestObj: request
    :param dict subRequestAttrs: subRequest's attributes
    :param dict subRequestFiles: subRequest's files
    """
        self.info("removeFile: processing subrequest %s" % index)
        if requestObj.isSubRequestEmpty(index, "removal")["Value"]:
            self.info(
                "removeFile: subrequest %s is empty, setting its status to 'Done'"
                % index)
            requestObj.setSubRequestStatus(index, "removal", "Done")
            return S_OK(requestObj)

        lfns = [
            str(subRequestFile["LFN"]) for subRequestFile in subRequestFiles
            if subRequestFile["Status"] == "Waiting"
            and str(subRequestFile["LFN"])
        ]
        self.debug("removeFile: about to remove %d files" % len(lfns))
        # # keep removal status for each file
        removalStatus = dict.fromkeys(lfns, "")
        self.addMark("RemoveFileAtt", len(lfns))

        # # bulk removal 1st
        bulkRemoval = self.replicaManager().removeFile(lfns)
        if not bulkRemoval["OK"]:
            self.error("removeFile: unable to remove files: %s" %
                       bulkRemoval["Message"])
            subRequestError = bulkRemoval["Message"][:255]
            subRequestError = requestObj.setSubRequestAttributeValue(
                index, "removal", "Error", subRequestError)
            return S_OK(requestObj)
        bulkRemoval = bulkRemoval["Value"]
        successfulLfns = bulkRemoval.get("Successful", [])
        failedLfns = bulkRemoval.get("Failed", [])
        toRemove = []
        for lfn in removalStatus:
            if lfn in failedLfns and "no such file or directory" in str(
                    bulkRemoval["Failed"][lfn]).lower():
                removalStatus[lfn] = bulkRemoval["Failed"][lfn]
                removeCatalog = self.replicaManager().removeCatalogFile(
                    lfn, singleFile=True)
                if not removeCatalog["OK"]:
                    removalStatus[lfn] = removeCatalog["Message"]
                    continue
            else:
                toRemove.append(lfn)

        # # loop over LFNs to remove
        for lfn in toRemove:
            self.debug("removeFile: processing file %s" % lfn)
            try:
                # # try to remove using proxy already defined in os.environ
                removal = self.replicaManager().removeFile(lfn)
                # # file does not exist?
                if not removal["OK"] and \
                   "no such file or directory" in str(removal["Message"]).lower():
                    removalStatus[lfn] = removal["Message"]
                    continue
                # # not OK but request belongs to DataManager?
                if not self.requestOwnerDN and \
                   ( not removal["OK"] and "Write access not permitted for this credential." in removal["Message"] ) or \
                   ( removal["OK"] and "Failed" in removal["Value"] and
                     lfn in removal["Value"]["Failed"] and
                     "permission denied" in str( removal["Value"]["Failed"][lfn] ).lower() ):
                    self.debug("removeFile: retrieving proxy for %s" % lfn)
                    getProxyForLFN = self.getProxyForLFN(lfn)
                    # # can't get correct proxy? continue...
                    if not getProxyForLFN["OK"]:
                        self.warn(
                            "removeFile: unable to get proxy for file %s: %s" %
                            (lfn, getProxyForLFN["Message"]))
                        removal = getProxyForLFN
                    else:
                        # # you're a DataManager, retry with the new proxy
                        removal = self.replicaManager().removeFile(lfn)
            finally:
                # # make sure DataManager proxy is set back in place
                if not self.requestOwnerDN and self.dataManagerProxy():
                    # # remove temp proxy
                    if os.environ["X509_USER_PROXY"] != self.dataManagerProxy(
                    ):
                        os.unlink(os.environ["X509_USER_PROXY"])
                    # # put back DataManager proxy
                    os.environ["X509_USER_PROXY"] = self.dataManagerProxy()

            # # save error
            if not removal["OK"]:
                removalStatus[lfn] = removal["Message"]
                continue
            # # check fail reason, filter out missing files
            removal = removal["Value"]
            if lfn in removal["Failed"]:
                removalStatus[lfn] = removal["Failed"][lfn]

        # # counters
        filesRemoved = 0
        filesFailed = 0
        subRequestError = []
        # # update File statuses and errors
        for lfn, error in removalStatus.items():

            # # set file error if any
            if error:
                self.debug("removeFile: %s: %s" % (lfn, str(error)))
                fileError = str(error).replace("'", "")[:255]
                fileError = requestObj.setSubRequestFileAttributeValue(
                    index, "removal", lfn, "Error", fileError)
                if not fileError["OK"]:
                    self.error("removeFile: unable to set Error for %s: %s" %
                               (lfn, fileError["Message"]))
            # # no error, or the file did not exist? - we can recover
            if not error or "no such file or directory" in str( error ).lower() or \
                  "file does not exist" in str( error ).lower():
                filesRemoved += 1
                self.info("removeFile: successfully removed %s" % lfn)
                updateStatus = requestObj.setSubRequestFileAttributeValue(
                    index, "removal", lfn, "Status", "Done")
                if not updateStatus["OK"]:
                    self.error(
                        "removeFile: unable to change status to 'Done' for %s"
                        % lfn)
            else:
                filesFailed += 1
                self.warn("removeFile: unable to remove file %s : %s" %
                          (lfn, error))
                errorStr = str(error)
                if isinstance(error, dict):
                    errorStr = ";".join([
                        "%s:%s" % (key, value) for key, value in error.items()
                    ])
                errorStr = errorStr.replace("'", "")
                subRequestError.append("%s:%s" % (lfn, errorStr))

        self.addMark("RemoveFileDone", filesRemoved)
        self.addMark("RemoveFileFail", filesFailed)

        # # all 'Done'?
        if requestObj.isSubRequestDone(index, "removal")["Value"]:
            self.info(
                "removeFile: all files processed, setting subrequest status to 'Done'"
            )
            requestObj.setSubRequestStatus(index, "removal", "Done")
        elif filesFailed:
            self.info(
                "removeFile: all files processed, %s files failed to remove" %
                filesFailed)
            subRequestError = ";".join(subRequestError)[:255]
            subRequestError = requestObj.setSubRequestAttributeValue(
                index, "removal", "Error", subRequestError)
        return S_OK(requestObj)
Example #20
    def getResourceUsage(self):
        """Returns a dictionary containing CPUConsumed, CPULimit, WallClockConsumed
       and WallClockLimit for current slot.  All values returned in seconds.
    """
        cmd = 'qstat -f %s' % (self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        cpu = None
        cpuLimit = None
        wallClock = None
        wallClockLimit = None

        lines = str(result['Value']).split('\n')
        for line in lines:
            info = line.split()
            if re.search('.*resources_used.cput.*', line):
                if len(info) >= 3:
                    cpuList = info[2].split(':')
                    newcpu = (float(cpuList[0]) * 60 +
                              float(cpuList[1])) * 60 + float(cpuList[2])
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                else:
                    self.log.warn('Problem parsing "%s" for CPU consumed' %
                                  line)
            if re.search('.*resources_used.pcput.*', line):
                if len(info) >= 3:
                    cpuList = info[2].split(':')
                    newcpu = (float(cpuList[0]) * 60 +
                              float(cpuList[1])) * 60 + float(cpuList[2])
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                else:
                    self.log.warn('Problem parsing "%s" for CPU consumed' %
                                  line)
            if re.search('.*resources_used.walltime.*', line):
                if len(info) >= 3:
                    wcList = info[2].split(':')
                    wallClock = (float(wcList[0]) * 60 +
                                 float(wcList[1])) * 60 + float(wcList[2])
                else:
                    self.log.warn(
                        'Problem parsing "%s" for elapsed wall clock time' %
                        line)
            if re.search('.*Resource_List.cput.*', line):
                if len(info) >= 3:
                    cpuList = info[2].split(':')
                    newcpuLimit = (float(cpuList[0]) * 60 +
                                   float(cpuList[1])) * 60 + float(cpuList[2])
                    if not cpuLimit or newcpuLimit < cpuLimit:
                        cpuLimit = newcpuLimit
                else:
                    self.log.warn('Problem parsing "%s" for CPU limit' % line)
            if re.search('.*Resource_List.pcput.*', line):
                if len(info) >= 3:
                    cpuList = info[2].split(':')
                    newcpuLimit = (float(cpuList[0]) * 60 +
                                   float(cpuList[1])) * 60 + float(cpuList[2])
                    if not cpuLimit or newcpuLimit < cpuLimit:
                        cpuLimit = newcpuLimit
                else:
                    self.log.warn('Problem parsing "%s" for CPU limit' % line)
            if re.search('.*Resource_List.walltime.*', line):
                if len(info) >= 3:
                    wcList = info[2].split(':')
                    wallClockLimit = (float(wcList[0]) * 60 +
                                      float(wcList[1])) * 60 + float(wcList[2])
                else:
                    self.log.warn('Problem parsing "%s" for wall clock limit' %
                                  line)

        consumed = {
            'CPU': cpu,
            'CPULimit': cpuLimit,
            'WallClock': wallClock,
            'WallClockLimit': wallClockLimit
        }
        self.log.debug(consumed)

        if None not in consumed.values():
            self.log.debug("TimeLeft counters complete:", str(consumed))
            return S_OK(consumed)
        else:
            missed = [key for key, val in consumed.items() if val is None]
            self.log.info('Could not determine parameter', ','.join(missed))
            self.log.debug(
                'This is the stdout from the batch system call\n%s' %
                (result['Value']))

        if cpuLimit or wallClockLimit:
            # We got a partial result from PBS; assume the job ran for too short a time
            if not cpuLimit:
                consumed['CPULimit'] = wallClockLimit * 0.8
            if not wallClockLimit:
                consumed['WallClockLimit'] = cpuLimit / 0.8
            if not cpu:
                consumed['CPU'] = int(time.time() - self.startTime)
            if not wallClock:
                consumed['WallClock'] = int(time.time() - self.startTime)
            self.log.debug("TimeLeft counters restored:", str(consumed))
            return S_OK(consumed)
        else:
            msg = 'Could not determine some parameters'
            self.log.info(
                msg, ':\nThis is the stdout from the batch system call\n%s' %
                (result['Value']))
            retVal = S_ERROR(msg)
            retVal['Value'] = consumed
            return retVal
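Every qstat field parsed above is an 'HH:MM:SS' stamp, and the repeated conversion boils down to one helper:

def hms_to_seconds(stamp):
    # '12:34:56' -> 45296.0, the same arithmetic as the cpuList/wcList code above
    hours, minutes, seconds = (float(part) for part in stamp.split(':'))
    return (hours * 60 + minutes) * 60 + seconds

assert hms_to_seconds('12:34:56') == 45296.0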
Example #21
    def reTransfer(self, index, requestObj, subRequestAttrs, subRequestFiles):
        """ action for 'reTransfer' operation

    :param self: self reference
    :param int index: subRequest index in execution order
    :param RequestContainer requestObj: request
    :param dict subRequestAttrs: subRequest's attributes
    :param dict subRequestFiles: subRequest's files
    """
        self.info("reTransfer: processing subrequest %s" % index)
        if requestObj.isSubRequestEmpty(index, "removal")["Value"]:
            self.info(
                "reTransfer: subrequest %s is empty, setting its status to 'Done'"
                % index)
            requestObj.setSubRequestStatus(index, "removal", "Done")
            return S_OK(requestObj)
        subRequestError = []

        targetSEs = list(
            set([
                targetSE.strip()
                for targetSE in subRequestAttrs["TargetSE"].split(",")
                if targetSE.strip()
            ]))
        lfnsPfns = [(subFile["LFN"], subFile["PFN"], subFile["Status"])
                    for subFile in subRequestFiles]

        failed = {}
        for lfn, pfn, status in lfnsPfns:
            self.info("reTransfer: processing file %s" % lfn)
            if status != "Waiting":
                self.info("reTransfer: skipping file %s, status is %s" %
                          (lfn, status))
                continue
            failed.setdefault(lfn, {})
            for targetSE in targetSEs:
                reTransfer = self.replicaManager().onlineRetransfer(
                    targetSE, pfn)
                if reTransfer["OK"]:
                    if pfn in reTransfer["Value"]["Successful"]:
                        self.info(
                            "reTransfer: succesfully requested retransfer of %s"
                            % pfn)
                    else:
                        reason = reTransfer["Value"]["Failed"][pfn]
                        self.error(
                            "reTransfer: failed to set retransfer request for %s at %s: %s"
                            % (pfn, targetSE, reason))
                        failed[lfn][targetSE] = reason
                        subRequestError.append("%s:%s:%s" %
                                               (lfn, targetSE, reason))
                else:
                    self.error(
                        "reTransfer: completely failed to retransfer: %s" %
                        reTransfer["Message"])
                    failed[lfn][targetSE] = reTransfer["Message"]
                    subRequestError.append(
                        "%s:%s:%s" % (lfn, targetSE, reTransfer["Message"]))
            if not failed[lfn]:
                self.info(
                    "reTransfer: file %s sucessfully processed at all targetSEs"
                    % lfn)
                requestObj.setSubRequestFileAttributeValue(
                    index, "removal", lfn, "Status", "Done")

        # # subrequest empty or all Files done?
        if requestObj.isSubRequestDone(index, "removal")["Value"]:
            self.info(
                "reTransfer: all files processed, setting subrequest status to 'Done'"
            )
            requestObj.setSubRequestStatus(index, "removal", "Done")
        else:
            subRequestError = requestObj.setSubRequestAttributeValue(
                index, "removal", "Error", ";".join(subRequestError)[:255])
        return S_OK(requestObj)
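The loop above keeps a per-LFN, per-targetSE error ledger and only marks a file 'Done' when no target reported a failure. The same bookkeeping in isolation; the names and the transferFunc callback are illustrative, not from the original agent:

def processFiles(lfns, targetSEs, transferFunc):
    """transferFunc(lfn, se) returns an error string or None; a file counts
    as fully processed only if every target SE succeeded."""
    failed = {}
    done = []
    for lfn in lfns:
        failed.setdefault(lfn, {})
        for se in targetSEs:
            error = transferFunc(lfn, se)
            if error:
                failed[lfn][se] = error
        if not failed[lfn]:
            done.append(lfn)
    return done, failed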
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary
    """

    queues = self.queueDict.keys()

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
      tags += self.queueDict[queue]['ParametersDict']['Tags']
    tqDict['Tag'] = list( set( tags ) )

    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            jobSites.add( site )
          else:
            anySite = True
      else:
        anySite = True
      if "JobTypes" in result['Value'][tqID]:
        if "Sites" in result['Value'][tqID]:
          for site in result['Value'][tqID]['Sites']:
            if site.lower() != 'any':
              testSites.add( site )
      totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    self.log.info( tqIDList )
    result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                          'Status': WAITING_PILOT_STATUS },
                                        None )
    tagWaitingPilots = 0
    if result['OK']:
      tagWaitingPilots = result['Value']
    self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % ( totalWaitingJobs, len( tqIDList ), tagWaitingPilots ) )
    self.log.info( 'Queues: ', self.queueDict.keys() )
    # if tagWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    if self.rssFlag:

      result = self.siteClient.getUsableSites()
      if not result['OK']:
        return S_ERROR( 'Cannot get the site status' )
      siteMaskList = result['Value']

    else:

      # Use the old way, check if the site is allowed in the mask
      result = jobDB.getSiteMask()
      if not result['OK']:
        return S_ERROR( 'Cannot get the site mask' )
      siteMaskList = result['Value']

    random.shuffle( queues )
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues[ queue ] % self.failedQueueCycleFactor
      if failedCount != 0:
        self.log.warn( "%s queue failed recently, skipping %d cycles" % ( queue, 10 - failedCount ) )
        self.failedQueues[queue] += 1
        continue

      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      platform = self.queueDict[queue]['Platform']
      queueTags = self.queueDict[queue]['ParametersDict']['Tags']
      siteMask = siteName in siteMaskList
      processorTags = []

      for tag in queueTags:
        if re.match( r'^[0-9]+Processors$', tag ):
          processorTags.append( tag )
      if 'WholeNode' in queueTags:
        processorTags.append( 'WholeNode' )

      if not anySite and siteName not in jobSites:
        self.log.verbose( "Skipping queue %s at %s: no workload expected" % ( queueName, siteName ) )
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose( "Skipping queue %s at site %s not in the mask" % ( queueName, siteName ) )
        continue

      if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      # if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      ceDict['Tag'] = processorTags
      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose( 'No matching TQs found for %s' % queue )
        continue

      matchedQueues += 1
      totalTQJobs = 0
      totalTQJobsByProcessors = {}
      tqIDList = taskQueueDict.keys()
      tqIDListByProcessors = {}
      for tq in taskQueueDict:
        if 'Tags' not in taskQueueDict[tq]:
          # skip non-multiprocessor TQs
          continue
        for tag in taskQueueDict[tq]['Tags']:
          if tag in processorTags:
            tqIDListByProcessors.setdefault( tag, [] )
            tqIDListByProcessors[tag].append( tq )

            totalTQJobsByProcessors.setdefault( tag, 0 )
            totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % ( totalTQJobs,
                                                                                        len( tqIDList ), queue ) )

      queueSubmittedPilots = 0
      for tag in tqIDListByProcessors:

        self.log.verbose( "Try to submit pilots for Tag=%s (TQs=%s)" % ( tag, tqIDListByProcessors[tag] ) )

        processors = 1

        m = re.match( r'^(?P<processors>[0-9]+)Processors$', tag )
        if m:
          processors = int( m.group( 'processors' ) )
        if tag == 'WholeNode':
          processors = -1

        tagTQJobs = totalTQJobsByProcessors[tag]
        tagTqIDList = tqIDListByProcessors[tag]

        # Get the number of already waiting pilots for these task queues
        tagWaitingPilots = 0
        if self.pilotWaitingFlag:
          lastUpdateTime = dateTime() - self.pilotWaitingTime * second
          result = pilotAgentsDB.countPilots( {'TaskQueueID': tagTqIDList,
                                               'Status': WAITING_PILOT_STATUS},
                                              None, lastUpdateTime )
          if not result['OK']:
            self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
            tagWaitingPilots = 0
          else:
            tagWaitingPilots = result['Value']
            self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots )
        if tagWaitingPilots >= tagTQJobs:
          self.log.verbose( "%d waiting pilots already for all the available jobs" % tagWaitingPilots )
          continue

        self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % ( tagWaitingPilots,
                                                                                           tagTQJobs, queue ) )

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, cpuTime - 60 )

        # Get the number of available slots on the target site/queue
        totalSlots = self.getQueueSlots( queue, False )
        if totalSlots == 0:
          self.log.debug( '%s: No slots available' % queue )
          continue

        # Note: comparing slots to job numbers is not accurate in multiprocessor case.
        #       This could lead to over submission.
        pilotsToSubmit = max( 0, min( totalSlots, tagTQJobs - tagWaitingPilots ) )
        self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                       ( queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit ) )

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min( self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit )

        while pilotsToSubmit > 0:
          self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

          bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
          jobExecDir = ''
          jobExecDir = self.queueDict[queue]['ParametersDict'].get( 'JobExecDir', jobExecDir )
          httpProxy = self.queueDict[queue]['ParametersDict'].get( 'HttpProxy', '' )

          result = self.getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
          if not result['OK']:
            return result

          executable, pilotSubmissionChunk = result['Value']
          result = ce.submitJob( executable, '', pilotSubmissionChunk, processors = processors )
          # ## FIXME: The condor thing only transfers the file with some
          # ## delay, so when we unlink here the script is gone
          # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
          if ceType != 'HTCondorCE':
            os.unlink( executable )
          if not result['OK']:
            self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
            pilotsToSubmit = 0
            self.failedQueues[queue] += 1
            continue

          pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
          queueSubmittedPilots += pilotSubmissionChunk
          # Add pilots to the PilotAgentsDB: assign pilots to TaskQueues
          # proportionally to the task queue priorities
          pilotList = result['Value']
          self.queueSlots[queue]['AvailableSlots'] -= len( pilotList )
          totalSubmittedPilots += len( pilotList )
          self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
          stampDict = {}
          if result.has_key( 'PilotStampDict' ):
            stampDict = result['PilotStampDict']
          tqPriorityList = []
          sumPriority = 0.
          for tq in tagTqIDList:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append( ( tq, sumPriority ) )
          rndm = random.random() * sumPriority
          tqDict = {}
          for pilotID in pilotList:
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
              if rndm < prio:
                tqID = tq
                break
            if not tqDict.has_key( tqID ):
              tqDict[tqID] = []
            tqDict[tqID].append( pilotID )

          for tqID, pilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference( pilotList,
                                                        tqID,
                                                        self.pilotDN,
                                                        self.pilotGroup,
                                                        self.localhost,
                                                        ceType,
                                                        '',
                                                        stampDict )
            if not result['OK']:
              self.log.error( 'Failed to add pilots to the PilotAgentsDB: ', result['Message'] )
              continue
            for pilot in pilotList:
              result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                     'Successfully submitted by the SiteDirector',
                                                     siteName, queueName )
              if not result['OK']:
                self.log.error( 'Failed to set pilot status: ', result['Message'] )
                continue

    self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % ( totalSubmittedPilots, matchedQueues ) )
    return S_OK()
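The pilot bookkeeping above assigns each submitted pilot to a task queue with probability proportional to the queue priority, by drawing from the cumulative priority list. Extracted as a standalone sketch (the function name is ours, not DIRAC's):

import random

def assignPilotsToTaskQueues(pilotList, tqPriorities):
    """tqPriorities: dict mapping tqID -> priority. Returns a dict mapping
    tqID -> list of pilots, each pilot drawn proportionally to priority."""
    tqPriorityList = []
    sumPriority = 0.
    for tq, priority in tqPriorities.items():
        sumPriority += priority
        tqPriorityList.append((tq, sumPriority))
    tqDict = {}
    for pilotID in pilotList:
        rndm = random.random() * sumPriority
        for tq, cumulative in tqPriorityList:
            if rndm < cumulative:
                tqDict.setdefault(tq, []).append(pilotID)
                break
    return tqDict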
Example #23
0
def pfnparse(pfn):
    """ parse pfn and save all bits of information into dictionary

  :param str pfn: pfn string
  """
    if not pfn:
        return S_ERROR(
            "wrong 'pfn' argument value in function call, expected non-empty string, got %s"
            % str(pfn))
    pfnDict = dict.fromkeys(
        ["Protocol", "Host", "Port", "WSUrl", "Path", "FileName"], "")
    try:
        if ":" not in pfn:
            # pfn = /a/b/c
            pfnDict["Path"] = os.path.dirname(pfn)
            pfnDict["FileName"] = os.path.basename(pfn)
        else:
            # pfn = protocol:/a/b/c
            # pfn = protocol://host/a/b/c
            # pfn = protocol://host:port/a/b/c
            # pfn = protocol://host:port/wsurl?=/a/b/c
            pfnDict["Protocol"] = pfn[0:pfn.index(":")]
            ## remove protocol:
            pfn = pfn[len(pfnDict["Protocol"]):]
            ## remove :// or :
            pfn = pfn[3:] if pfn.startswith("://") else pfn[1:]
            if pfn.startswith("/"):
                ## /a/b/c
                pfnDict["Path"] = os.path.dirname(pfn)
                pfnDict["FileName"] = os.path.basename(pfn)
            else:
                ## host/a/b/c
                ## host:port/a/b/c
                ## host:port/wsurl?=/a/b/c
                if ":" not in pfn:
                    ## host/a/b/c
                    pfnDict["Host"] = pfn[0:pfn.index("/")]
                    pfn = pfn[len(pfnDict["Host"]):]
                    pfnDict["Path"] = os.path.dirname(pfn)
                    pfnDict["FileName"] = os.path.basename(pfn)
                else:
                    ## host:port/a/b/c
                    ## host:port/wsurl?=/a/b/c
                    pfnDict["Host"] = pfn[0:pfn.index(":")]
                    ## port/a/b/c
                    ## port/wsurl?=/a/b/c
                    pfn = pfn[len(pfnDict["Host"]) + 1:]
                    pfnDict["Port"] = pfn[0:pfn.index("/")]
                    ## /a/b/c
                    ## /wsurl?=/a/b/c
                    pfn = pfn[len(pfnDict["Port"]):]
                    WSUrl = pfn.find("?")
                    WSUrlEnd = pfn.find("=")
                    if WSUrl == -1 and WSUrlEnd == -1:
                        ## /a/b/c
                        pfnDict["Path"] = os.path.dirname(pfn)
                        pfnDict["FileName"] = os.path.basename(pfn)
                    else:
                        ## /wsurl?blah=/a/b/c
                        pfnDict["WSUrl"] = pfn[0:WSUrlEnd + 1]
                        ## /a/b/c
                        pfn = pfn[len(pfnDict["WSUrl"]):]
                        pfnDict["Path"] = os.path.dirname(pfn)
                        pfnDict["FileName"] = os.path.basename(pfn)
        return S_OK(pfnDict)
    except Exception:
        errStr = "Pfn.pfnparse: Exception while parsing pfn: " + str(pfn)
        gLogger.exception(errStr)
        return S_ERROR(errStr)
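Tracing the branches above on a typical SRM-style URL shows how the fields come apart; the expected value below was derived by hand from the host:port/wsurl branch, not from a test run, and the URL itself is made up:

if __name__ == "__main__":
    res = pfnparse("srm://se.example.org:8446/srm/managerv2?SFN=/lhcb/user/file.dat")
    # Expected res['Value']:
    # {'Protocol': 'srm', 'Host': 'se.example.org', 'Port': '8446',
    #  'WSUrl': '/srm/managerv2?SFN=', 'Path': '/lhcb/user',
    #  'FileName': 'file.dat'}
    print(res)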
Example #24
0
class RequestAgentBase(object):
    """
  .. class:: RequestAgentBase

  Helper class for DMS agents dealing with RequestContainers and Requests.
  """
    ## reference to ReplicaManager
    __replicaManager = None
    ## reference to DataLoggingClient
    __dataLoggingClient = None
    ## reference to RequestClient
    __requestClient = None
    ## reference to RequestDbMySQL
    __requestDBMySQL = None
    ## reference to TransferDB itself
    __transferDB = None
    ## reference to StorageFactory
    __storageFactory = None

    ##############################################
    # component getters
    @classmethod
    def replicaManager(cls):
        """ ReplicaManager getter 
    :param cls: class reference
    """
        if not cls.__replicaManager:
            cls.__replicaManager = ReplicaManager()
        return cls.__replicaManager

    @classmethod
    def dataLoggingClient(cls):
        """ DataLoggingClient getter
    :param cls: class reference
    """
        if not cls.__dataLoggingClient:
            cls.__dataLoggingClient = DataLoggingClient()
        return cls.__dataLoggingClient

    @classmethod
    def requestClient(cls):
        """ RequestClient getter
    :param cls: class reference
    """
        if not cls.__requestClient:
            cls.__requestClient = RequestClient()
        return cls.__requestClient

    @classmethod
    def requestDBMySQL(cls):
        """ RequestDBMySQL getter
    :param cls: class reference
    """
        if not cls.__requestDBMySQL:
            cls.__requestDBMySQL = RequestDBMySQL()
        return cls.__requestDBMySQL

    @classmethod
    def transferDB(cls):
        """ TransferDB getter
    :param cls: class reference
    """
        if not cls.__transferDB:
            cls.__transferDB = TransferDB()
        return cls.__transferDB

    @classmethod
    def storageFactory(cls):
        """ StorageFactory getter
    :param cls: class reference
    """
        if not cls.__storageFactory:
            cls.__storageFactory = StorageFactory()
        return cls.__storageFactory

    @classmethod
    def getRequestDict(cls, requestType):
        """ retrive Request of type requestType from RequestDB
        
    :param cls: class reference
    :param str requestType: type of request
    :return: S_ERROR on error
    :return: S_OK with request dictionary::
    
       requestDict = { 
         "requestString" : str,
         "requestName" : str,
         "sourceServer" : str,
         "executionOrder" : list,
         "requestObj" : RequestContainer,
         "jobId" : int }
    """
        ## prepare requestDict
        requestDict = {
            "requestString": None,
            "requestName": None,
            "sourceServer": None,
            "executionOrder": None,
            "requestObj": None,
            "jobId": None
        }
        ## get request out of DB
        res = cls.requestClient().getRequest(requestType)
        if not res["OK"]:
            gLogger.error(res["Message"])
            return res
        elif not res["Value"]:
            msg = "Request of type '%s' not found in RequestDB." % requestType
            gLogger.info(msg)
            return S_OK()
        ## store values
        requestDict["requestName"] = res["Value"]["RequestName"]
        requestDict["requestString"] = res["Value"]["RequestString"]
        requestDict["sourceServer"] = res["Value"]["Server"]
        requestDict["requestObj"] = RequestContainer(
            request=requestDict["requestString"])
        ## get JobID
        try:
            requestDict["jobId"] = int(res["JobID"])
        except ValueError, exc:
            gLogger.warn(
                "Cannot read JobID for request %s, setting it to 0: %s" %
                (requestDict["requestName"], str(exc)))
            requestDict["jobId"] = 0
        ## get the execution order
        res = cls.requestClient().getCurrentExecutionOrder(
            requestDict["requestName"], requestDict["sourceServer"])
        if not res["OK"]:
            msg = "Can not get the execution order for request %s." % requestDict[
                "requestName"]
            gLogger.error(msg, res["Message"])
            return res
        requestDict["executionOrder"] = res["Value"]
        ## return requestDict
        return S_OK(requestDict)
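All the getters above follow the same lazily-initialised, class-level singleton pattern: build the client on first call, return the cached instance afterwards. A generic, self-contained sketch (Expensive stands in for ReplicaManager and friends):

class Expensive(object):
    """Stand-in for a costly client such as ReplicaManager."""

class AgentBaseSketch(object):
    __client = None

    @classmethod
    def client(cls):
        # created once, then shared by every caller of the class
        if cls.__client is None:
            cls.__client = Expensive()
        return cls.__client

assert AgentBaseSketch.client() is AgentBaseSketch.client()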
Example #25
0
    def getName(self):
        """ Get the catalog type name
        """
        return S_OK(self.name)
    def _Broadcast(self):
        """ This plug-in takes files found at the sourceSE and broadcasts to all (or a selection of) targetSEs.
    """
        if not self.params:
            return S_ERROR(
                "TransformationPlugin._Broadcast: The 'Broadcast' plugin requires additional parameters."
            )

        targetseParam = self.params['TargetSE']
        targetSEs = []
        sourceSEs = eval(self.params['SourceSE'])
        if targetseParam.count('['):
            targetSEs = eval(targetseParam)
        elif isinstance(targetseParam, list):
            targetSEs = targetseParam
        else:
            targetSEs = [targetseParam]
        # sourceSEs = eval(self.params['SourceSE'])
        # targetSEs = eval(self.params['TargetSE'])
        destinations = int(self.params.get('Destinations', 0))
        if destinations and (destinations >= len(targetSEs)):
            destinations = 0

        status = self.params['Status']
        groupSize = self.params['GroupSize']  # Number of files per task

        fileGroups = getFileGroups(self.data)  # groups by SE
        targetSELfns = {}
        for replicaSE, lfns in fileGroups.items():
            ses = replicaSE.split(',')
            # sourceSites = self._getSitesForSEs(ses)
            atSource = False
            for se in ses:
                if se in sourceSEs:
                    atSource = True
            if not atSource:
                continue

            for lfn in lfns:
                targets = []
                sources = self._getSitesForSEs(ses)
                random.shuffle(targetSEs)
                for targetSE in targetSEs:
                    site = self._getSiteForSE(targetSE)['Value']
                    if site not in sources:
                        if (destinations) and (len(targets) >= destinations):
                            continue
                        sources.append(site)
                    targets.append(
                        targetSE
                    )  # after all, if someone wants to copy to the source, it's his choice
                strTargetSEs = ','.join(sorted(targets))
                if not targetSELfns.has_key(strTargetSEs):
                    targetSELfns[strTargetSEs] = []
                targetSELfns[strTargetSEs].append(lfn)
        tasks = []
        for ses, lfns in targetSELfns.items():
            tasksLfns = breakListIntoChunks(lfns, groupSize)
            for taskLfns in tasksLfns:
                if (status == 'Flush') or (len(taskLfns) >= int(groupSize)):
                    # do not allow groups smaller than the groupSize, except if transformation is in flush state
                    tasks.append((ses, taskLfns))
        return S_OK(tasks)
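Task building above leans on DIRAC's getFileGroups and breakListIntoChunks helpers. The chunking behaves roughly like the re-implementation below (for illustration only; this is not the DIRAC source):

def breakListIntoChunksSketch(lst, chunkSize):
    # successive slices of at most chunkSize elements
    return [lst[i:i + chunkSize] for i in range(0, len(lst), chunkSize)]

# e.g. 5 LFNs with groupSize 2 -> [[l1, l2], [l3, l4], [l5]]; unless the
# transformation is in 'Flush' state, the trailing short group is held back.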
Example #27
0
  def getStorages(self, storageName, pluginList=None, hideExceptions=False):
    """ Get an instance of a Storage based on the DIRAC SE name based on the CS entries CS

        :param storageName: is the DIRAC SE name i.e. 'CERN-RAW'
        :param pluginList: is an optional list of protocols if a sub-set is desired i.e ['SRM2','SRM1']

        :return: dictionary containing storage elements and information about them
    """
    self.remotePlugins = []
    self.localPlugins = []
    self.name = ''
    self.options = {}
    self.protocols = {}
    self.storages = []
    if pluginList is None:
      pluginList = []
    elif isinstance(pluginList, basestring):
      pluginList = [pluginList]
    if not self.vo:
      gLogger.warn('No VO information available')

    # Get the name of the storage provided
    res = self._getConfigStorageName(storageName, 'Alias')
    if not res['OK']:
      return res
    storageName = res['Value']
    self.name = storageName

    # In case the storage is made from a base SE, get this information
    res = self._getConfigStorageName(storageName, 'BaseSE')
    if not res['OK']:
      return res
    # If the storage is derived from another one, keep the information
    # We initialize the seConfigPath to SE_BASE_CONFIG_PATH if there is a derivedSE, SE_CONFIG_PATH if not
    if res['Value'] != storageName:
      derivedStorageName = storageName
      storageName = res['Value']
      seConfigPath = SE_BASE_CONFIG_PATH
    else:
      derivedStorageName = None
      seConfigPath = SE_CONFIG_PATH

    # Get the options defined in the CS for this storage
    res = self._getConfigStorageOptions(storageName, derivedStorageName=derivedStorageName,
                                        seConfigPath=seConfigPath)
    if not res['OK']:
      # This is for backward compatibility and to invite developers to move their BaseSE to the correct section
      gLogger.warn("Deprecated configuration, you can ignore the error message above."
                   " Please move the BaseSE to the correct section: ", SE_BASE_CONFIG_PATH)
      # We change the value of seConfigPath to avoid other errors due to the bad SE_BASE_CONFIG_PATH
      seConfigPath = SE_CONFIG_PATH
      res = self._getConfigStorageOptions(storageName, derivedStorageName=derivedStorageName,
                                          seConfigPath=seConfigPath)
      if not res['OK']:
        return res
    self.options = res['Value']

    # Get the protocol specific details
    res = self._getConfigStorageProtocols(storageName, derivedStorageName=derivedStorageName,
                                          seConfigPath=seConfigPath)
    if not res['OK']:
      return res
    self.protocols = res['Value']

    requestedLocalPlugins = []
    requestedRemotePlugins = []
    requestedProtocolDetails = []
    turlProtocols = []
    # Generate the protocol specific plug-ins
    for protocolSection, protocolDetails in self.protocols.iteritems():
      pluginName = protocolDetails.get('PluginName', protocolSection)
      if pluginList and pluginName not in pluginList:
        continue
      protocol = protocolDetails['Protocol']
      result = self.__generateStorageObject(storageName, pluginName, protocolDetails, hideExceptions=hideExceptions)
      if result['OK']:
        self.storages.append(result['Value'])
        if pluginName in self.localPlugins:
          turlProtocols.append(protocol)
          requestedLocalPlugins.append(pluginName)
        if pluginName in self.remotePlugins:
          requestedRemotePlugins.append(pluginName)
        requestedProtocolDetails.append(protocolDetails)
      else:
        gLogger.info(result['Message'])

    if self.storages:
      resDict = {}
      resDict['StorageName'] = self.name
      resDict['StorageOptions'] = self.options
      resDict['StorageObjects'] = self.storages
      resDict['LocalPlugins'] = requestedLocalPlugins
      resDict['RemotePlugins'] = requestedRemotePlugins
      resDict['ProtocolOptions'] = requestedProtocolDetails
      resDict['TurlProtocols'] = turlProtocols
      return S_OK(resDict)
    else:
      errStr = "StorageFactory.getStorages: Failed to instantiate any storage protocols."
      gLogger.error(errStr, self.name)
      return S_ERROR(errStr)
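A hedged usage sketch: on success the factory hands back the dictionary assembled above. The SE name and plugin below are placeholders for whatever the local CS defines, and factory is assumed to be an initialised StorageFactory:

res = factory.getStorages('CERN-RAW', pluginList=['GFAL2_SRM2'])
if res['OK']:
    storageInfo = res['Value']
    print(storageInfo['StorageName'])    # resolved SE name (aliases followed)
    print(storageInfo['TurlProtocols'])  # protocols usable locally for TURLs
    # one plugin instance per matching protocol section:
    storages = storageInfo['StorageObjects']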
Example #28
0
  def __call__(self):
    """ remove replicas """

    # The flag 'rmsMonitoring' is set by the RequestTask and is False by default.
    # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
    if self.rmsMonitoring:
      self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
    else:
      # # gMonitor stuff
      gMonitor.registerActivity("RemoveReplicaAtt", "Replica removals attempted",
                                "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
      gMonitor.registerActivity("RemoveReplicaOK", "Successful replica removals",
                                "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
      gMonitor.registerActivity("RemoveReplicaFail", "Failed replica removals",
                                "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)

    # # prepare list of targetSEs
    targetSEs = self.operation.targetSEList
    # # check targetSEs for removal
    bannedTargets = self.checkSEsRSS(targetSEs, access='RemoveAccess')
    if not bannedTargets['OK']:
      if self.rmsMonitoring:
        for status in ["Attempted", "Failed"]:
          self.rmsMonitoringReporter.addRecord(
              self.createRMSRecord(status, len(self.operation))
          )
        self.rmsMonitoringReporter.commit()
      else:
        gMonitor.addMark("RemoveReplicaAtt")
        gMonitor.addMark("RemoveReplicaFail")
      return bannedTargets

    if bannedTargets['Value']:
      return S_OK("%s targets are banned for removal" % ",".join(bannedTargets['Value']))

    # # get waiting files
    waitingFiles = self.getWaitingFilesList()
    # # and prepare dict
    toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles)

    self.log.info("Todo: %s replicas to delete from %s SEs" % (len(toRemoveDict), len(targetSEs)))

    if self.rmsMonitoring:
      self.rmsMonitoringReporter.addRecord(
          self.createRMSRecord("Attempted", len(toRemoveDict))
      )
    else:
      gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs))

    # # keep status for each targetSE
    removalStatus = dict.fromkeys(toRemoveDict, None)
    for lfn in removalStatus:
      removalStatus[lfn] = dict.fromkeys(targetSEs, None)

    # # loop over targetSEs
    for targetSE in targetSEs:

      self.log.info("Removing replicas at %s" % targetSE)

      # # 1st step - bulk removal
      bulkRemoval = self._bulkRemoval(toRemoveDict, targetSE)
      if not bulkRemoval["OK"]:
        self.log.error('Bulk replica removal failed', bulkRemoval["Message"])

        if self.rmsMonitoring:
          self.rmsMonitoringReporter.commit()

        return bulkRemoval

      # # report removal status for successful files
      if self.rmsMonitoring:
        self.rmsMonitoringReporter.addRecord(
            self.createRMSRecord("Successful", len(([opFile for opFile in toRemoveDict.values()
                                                     if not opFile.Error])))
        )
      else:
        gMonitor.addMark("RemoveReplicaOK", len([opFile for opFile in toRemoveDict.values() if not opFile.Error]))

      # # 2nd step - process the rest again
      toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Error)
      for lfn, opFile in toRetry.items():
        self._removeWithOwnerProxy(opFile, targetSE)
        if opFile.Error:
          if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Failed", 1)
            )
          else:
            gMonitor.addMark("RemoveReplicaFail", 1)
          removalStatus[lfn][targetSE] = opFile.Error
        else:
          if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Successful", 1)
            )
          else:
            gMonitor.addMark("RemoveReplicaOK", 1)

    # # update file status for waiting files
    failed = 0
    for opFile in self.operation:
      if opFile.Status == "Waiting":
        errors = list(set(error for error in removalStatus[opFile.LFN].values() if error))
        if errors:
          opFile.Error = "\n".join(errors)
          # This seems to be the only unrecoverable error
          if "Write access not permitted for this credential" in opFile.Error:
            failed += 1
            opFile.Status = "Failed"
        else:
          opFile.Status = "Done"

    if failed:
      self.operation.Error = "failed to remove %s replicas" % failed

    if self.rmsMonitoring:
      self.rmsMonitoringReporter.commit()

    return S_OK()
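Every outcome above is reported twice, once per monitoring backend. A small helper could collapse the repeated branching; a sketch assuming the attributes used by the handler (rmsMonitoring, rmsMonitoringReporter, createRMSRecord) and gMonitor are in scope:

def _report(self, status, count, gMonitorMark):
    """Hypothetical helper: route one monitoring event either to the ES
    reporter or to gMonitor, mirroring the if/else blocks above."""
    if self.rmsMonitoring:
        self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, count))
    else:
        gMonitor.addMark(gMonitorMark, count)

# e.g. self._report("Successful", 1, "RemoveReplicaOK")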
Example #29
0
  def renewFromMyProxy( self, userDN, userGroup, lifeTime = False, chain = False ):
    if not lifeTime:
      lifeTime = 43200
    if not self.__useMyProxy:
      return S_ERROR( "myproxy is disabled" )
    #Get the chain
    if not chain:
      retVal = self.__getPemAndTimeLeft( userDN, userGroup )
      if not retVal[ 'OK' ]:
        return retVal
      pemData = retVal[ 'Value' ][0]
      chain = X509Chain()
      retVal = chain.loadProxyFromString( pemData )
      if not retVal[ 'OK' ]:
        return retVal

    originChainLifeTime = chain.getRemainingSecs()[ 'Value' ]
    maxMyProxyLifeTime = self.getMyProxyMaxLifeTime()
    # If the chain still has more than 0.8 of the max MyProxy lifetime, don't ask MyProxy
    if originChainLifeTime > maxMyProxyLifeTime * 0.8:
      self.log.error( "Skipping myproxy download",
                      "user %s %s  chain has %s secs and requested %s secs" % ( userDN,
                                                                                userGroup,
                                                                                originChainLifeTime,
                                                                                maxMyProxyLifeTime ) )
      return S_OK( chain )

    lifeTime *= 1.3
    if lifeTime > maxMyProxyLifeTime:
      lifeTime = maxMyProxyLifeTime
    self.log.error( "Renewing proxy from myproxy", "user %s %s for %s secs" % ( userDN, userGroup, lifeTime ) )

    myProxy = MyProxy( server = self.getMyProxyServer() )
    retVal = myProxy.getDelegatedProxy( chain, lifeTime )
    if not retVal[ 'OK' ]:
      return retVal
    mpChain = retVal[ 'Value' ]
    retVal = mpChain.getRemainingSecs()
    if not retVal[ 'OK' ]:
      return S_ERROR( "Can't retrieve remaining secs from renewed proxy: %s" % retVal[ 'Message' ] )
    mpChainSecsLeft = retVal['Value']
    if mpChainSecsLeft < originChainLifeTime:
      self.log.info( "Chain downloaded from myproxy has less lifetime than the one stored in the db",
                     "\n Downloaded from myproxy: %s secs\n Stored in DB: %s secs" % ( mpChainSecsLeft, originChainLifeTime ) )
      return S_OK( chain )
    retVal = mpChain.getDIRACGroup()
    if not retVal[ 'OK' ]:
      return S_ERROR( "Can't retrieve DIRAC Group from renewed proxy: %s" % retVal[ 'Message' ] )
    chainGroup = retVal['Value']
    if chainGroup != userGroup:
      return S_ERROR( "Mismatch between renewed proxy group and expected: %s vs %s" % ( userGroup, chainGroup ) )
    retVal = self.storeProxy( userDN, userGroup, mpChain )
    if not retVal[ 'OK' ]:
      self.log.error( "Cannot store proxy after renewal", retVal[ 'Message' ] )
    retVal = myProxy.getServiceDN()
    if not retVal[ 'OK' ]:
      hostDN = userDN
    else:
      hostDN = retVal[ 'Value' ]
    self.logAction( "myproxy renewal", hostDN, "host", userDN, userGroup )
    return S_OK( mpChain )
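The renewal requests 1.3x the asked-for lifetime from MyProxy, capped at the server maximum. With the 43200 s default and an illustrative 50000 s cap:

lifeTime = 43200                   # 12 h default from above
maxMyProxyLifeTime = 50000         # illustrative server maximum
lifeTime *= 1.3                    # padded request: 56160.0
if lifeTime > maxMyProxyLifeTime:
    lifeTime = maxMyProxyLifeTime  # capped at 50000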
Example #30
0
    def getPilotMonitorWeb(self, selectDict, sortList, startItem, maxItems):
        """ Get summary of the pilot job information in a standard structure
    """

        resultDict = {}
        last_update = None
        if selectDict.has_key('LastUpdateTime'):
            last_update = selectDict['LastUpdateTime']
            del selectDict['LastUpdateTime']
        if selectDict.has_key('Owner'):
            userList = selectDict['Owner']
            if type(userList) != type([]):
                userList = [userList]
            dnList = []
            for uName in userList:
                uList = getDNForUsername(uName)['Value']
                dnList += uList
            selectDict['OwnerDN'] = dnList
            del selectDict['Owner']
        startDate = selectDict.get('FromDate', None)
        if startDate:
            del selectDict['FromDate']
        # For backward compatibility
        if startDate is None:
            startDate = selectDict.get('LastUpdateTime', None)
            if startDate:
                del selectDict['LastUpdateTime']
        endDate = selectDict.get('ToDate', None)
        if endDate:
            del selectDict['ToDate']

        # Sorting instructions. Only one for the moment.
        if sortList:
            orderAttribute = sortList[0][0] + ":" + sortList[0][1]
        else:
            orderAttribute = None

        # Select pilots for the summary
        result = self.selectPilots(selectDict,
                                   orderAttribute=orderAttribute,
                                   newer=startDate,
                                   older=endDate,
                                   timeStamp='LastUpdateTime')
        if not result['OK']:
            return S_ERROR('Failed to select pilots: ' + result['Message'])

        pList = result['Value']
        nPilots = len(pList)
        resultDict['TotalRecords'] = nPilots
        if nPilots == 0:
            return S_OK(resultDict)

        ini = startItem
        last = ini + maxItems
        if ini >= nPilots:
            return S_ERROR('Item number out of range')
        if last > nPilots:
            last = nPilots
        pilotList = pList[ini:last]

        paramNames = [
            'PilotJobReference', 'OwnerDN', 'OwnerGroup', 'GridType', 'Broker',
            'Status', 'DestinationSite', 'BenchMark', 'ParentID',
            'SubmissionTime', 'PilotID', 'LastUpdateTime', 'CurrentJobID',
            'TaskQueueID', 'GridSite'
        ]

        result = self.getPilotInfo(pilotList, paramNames=paramNames)
        if not result['OK']:
            return S_ERROR('Failed to get pilot info: ' + result['Message'])

        pilotDict = result['Value']
        records = []
        for pilot in pilotList:
            parList = []
            for parameter in paramNames:
                if type(pilotDict[pilot][parameter]) not in [
                        IntType, LongType
                ]:
                    parList.append(str(pilotDict[pilot][parameter]))
                else:
                    parList.append(pilotDict[pilot][parameter])
                if parameter == 'GridSite':
                    gridSite = pilotDict[pilot][parameter]

            # If the Grid Site is unknown try to recover it in the last moment
            if gridSite == "Unknown":
                ce = pilotDict[pilot]['DestinationSite']
                result = getSiteForCE(ce)
                if result['OK']:
                    gridSite = result['Value']
                    del parList[-1]
                    parList.append(gridSite)
            records.append(parList)

        resultDict['ParameterNames'] = paramNames
        resultDict['Records'] = records

        return S_OK(resultDict)
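The window selection above is plain list slicing with a bounds check; as a standalone helper (the name is ours):

def pageSlice(records, startItem, maxItems):
    """Return records[startItem:startItem + maxItems]; an out-of-range
    start is an error, a too-large end is silently clipped by the slice."""
    if startItem >= len(records):
        raise IndexError('Item number out of range')
    return records[startItem:startItem + maxItems]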
Example #31
0
    def getPilotInfo(self,
                     pilotRef=False,
                     parentId=False,
                     conn=False,
                     paramNames=[],
                     pilotID=False):
        """ Get all the information for the pilot job reference or reference list
    """

        parameters = [
            'PilotJobReference', 'OwnerDN', 'OwnerGroup', 'GridType', 'Broker',
            'Status', 'DestinationSite', 'BenchMark', 'ParentID',
            'OutputReady', 'AccountingSent', 'SubmissionTime', 'PilotID',
            'LastUpdateTime', 'TaskQueueID', 'GridSite', 'PilotStamp', 'Queue'
        ]
        if paramNames:
            parameters = paramNames

        cmd = "SELECT %s FROM PilotAgents" % ", ".join(parameters)
        condSQL = []
        if pilotRef:
            if type(pilotRef) == ListType:
                condSQL.append("PilotJobReference IN (%s)" %
                               ",".join(['"%s"' % x for x in pilotRef]))
            else:
                condSQL.append("PilotJobReference = '%s'" % pilotRef)
        if pilotID:
            if type(pilotID) == ListType:
                condSQL.append("PilotID IN (%s)" %
                               ",".join(['%s' % x for x in pilotID]))
            else:
                condSQL.append("PilotID = '%s'" % pilotID)
        if parentId:
            if type(parentId) == ListType:
                condSQL.append("ParentID IN (%s)" %
                               ",".join(['%s' % x for x in parentId]))
            else:
                condSQL.append("ParentID = %s" % parentId)
        if condSQL:
            cmd = "%s WHERE %s" % (cmd, " AND ".join(condSQL))

        result = self._query(cmd, conn=conn)
        if not result['OK']:
            return result
        if not result['Value']:
            msg = "No pilots found"
            if pilotRef:
                msg += " for PilotJobReference(s): %s" % pilotRef
            if parentId:
                msg += " with parent id: %s" % parentId
            return S_ERROR(msg)

        resDict = {}
        pilotIDs = []
        for row in result['Value']:
            pilotDict = {}
            for i in range(len(parameters)):
                pilotDict[parameters[i]] = row[i]
                if parameters[i] == 'PilotID':
                    pilotIDs.append(row[i])
            resDict[row[0]] = pilotDict

        result = self.getJobsForPilot(pilotIDs)
        if not result['OK']:
            return S_OK(resDict)

        jobsDict = result['Value']
        for pilotRef in resDict:
            pilotInfo = resDict[pilotRef]
            pilotID = pilotInfo['PilotID']
            if pilotID in jobsDict:
                pilotInfo['Jobs'] = jobsDict[pilotID]

        return S_OK(resDict)
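The index loop that turns each result row into a dictionary is the classic zip pattern; a behaviour-preserving, more idiomatic sketch (names are ours):

def rowsToDicts(parameters, rows):
    # each row becomes {column name: value}, keyed by its first column
    return dict((row[0], dict(zip(parameters, row))) for row in rows)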
Example #32
0
    def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
        """ Get summary of the pilot jobs status by CE/site in a standard structure
    """

        stateNames = [
            'Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running', 'Done',
            'Aborted'
        ]
        allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour']
        paramNames = ['Site', 'CE'] + allStateNames

        resultDict = {}
        last_update = None
        if selectDict.has_key('LastUpdateTime'):
            last_update = selectDict['LastUpdateTime']
            del selectDict['LastUpdateTime']
        site_select = []
        if selectDict.has_key('GridSite'):
            site_select = selectDict['GridSite']
            if type(site_select) != type([]):
                site_select = [site_select]
            del selectDict['GridSite']

        status_select = []
        if selectDict.has_key('Status'):
            status_select = selectDict['Status']
            if type(status_select) != type([]):
                status_select = [status_select]
            del selectDict['Status']

        expand_site = ''
        if selectDict.has_key('ExpandSite'):
            expand_site = selectDict['ExpandSite']
            site_select = [expand_site]
            del selectDict['ExpandSite']

        start = time.time()
        # Get all the data from the database with various selections
        result = self.getCounters('PilotAgents',
                                  ['GridSite', 'DestinationSite', 'Status'],
                                  selectDict,
                                  newer=last_update,
                                  timeStamp='LastUpdateTime')
        if not result['OK']:
            return result

        last_update = Time.dateTime() - Time.hour
        selectDict['Status'] = 'Aborted'
        resultHour = self.getCounters(
            'PilotAgents', ['GridSite', 'DestinationSite', 'Status'],
            selectDict,
            newer=last_update,
            timeStamp='LastUpdateTime')
        if not resultHour['OK']:
            return resultHour

        last_update = Time.dateTime() - Time.day
        selectDict['Status'] = ['Aborted', 'Done']
        resultDay = self.getCounters('PilotAgents',
                                     ['GridSite', 'DestinationSite', 'Status'],
                                     selectDict,
                                     newer=last_update,
                                     timeStamp='LastUpdateTime')
        if not resultDay['OK']:
            return resultDay
        selectDict['CurrentJobID'] = 0
        selectDict['Status'] = 'Done'
        resultDayEmpty = self.getCounters(
            'PilotAgents', ['GridSite', 'DestinationSite', 'Status'],
            selectDict,
            newer=last_update,
            timeStamp='LastUpdateTime')
        if not resultDayEmpty['OK']:
            return resultDayEmpty

        ceMap = {}
        resMap = getCESiteMapping()
        if resMap['OK']:
            ceMap = resMap['Value']

        # Sort out different counters
        resultDict = {}
        resultDict['Unknown'] = {}
        for attDict, count in result['Value']:
            site = attDict['GridSite']
            ce = attDict['DestinationSite']
            state = attDict['Status']
            if site == 'Unknown' and ce != "Unknown" and ce != "Multiple" and ceMap.has_key(
                    ce):
                site = ceMap[ce]
            if not resultDict.has_key(site):
                resultDict[site] = {}
            if not resultDict[site].has_key(ce):
                resultDict[site][ce] = {}
                for p in allStateNames:
                    resultDict[site][ce][p] = 0

            resultDict[site][ce][state] = count

        for attDict, count in resultDay['Value']:
            site = attDict['GridSite']
            ce = attDict['DestinationSite']
            state = attDict['Status']
            if site == 'Unknown' and ce != "Unknown" and ceMap.has_key(ce):
                site = ceMap[ce]
            if state == "Done":
                resultDict[site][ce]["Done"] = count
            if state == "Aborted":
                resultDict[site][ce]["Aborted"] = count

        for attDict, count in resultDayEmpty['Value']:
            site = attDict['GridSite']
            ce = attDict['DestinationSite']
            state = attDict['Status']
            if site == 'Unknown' and ce != "Unknown" and ceMap.has_key(ce):
                site = ceMap[ce]
            if state == "Done":
                resultDict[site][ce]["Done_Empty"] = count

        for attDict, count in resultHour['Value']:
            site = attDict['GridSite']
            ce = attDict['DestinationSite']
            state = attDict['Status']
            if site == 'Unknown' and ce != "Unknown" and ceMap.has_key(ce):
                site = ceMap[ce]
            if state == "Aborted":
                resultDict[site][ce]["Aborted_Hour"] = count

        records = []
        siteSumDict = {}
        for site in resultDict:
            sumDict = {}
            for state in allStateNames:
                if not sumDict.has_key(state):
                    sumDict[state] = 0
            sumDict['Total'] = 0
            for ce in resultDict[site]:
                itemList = [site, ce]
                total = 0
                for state in allStateNames:
                    itemList.append(resultDict[site][ce][state])
                    sumDict[state] += resultDict[site][ce][state]
                    if state == "Done":
                        done = resultDict[site][ce][state]
                    if state == "Done_Empty":
                        empty = resultDict[site][ce][state]
                    if state == "Aborted":
                        aborted = resultDict[site][ce][state]
                    if state == "Aborted_Hour":
                        aborted_hour = resultDict[site][ce][state]
                    if state != "Aborted_Hour" and state != "Done_Empty":
                        total += resultDict[site][ce][state]

                sumDict['Total'] += total
                # Add the total number of pilots seen in the last day
                itemList.append(total)
                # Add pilot submission efficiency evaluation
                if (done - empty) > 0:
                    eff = float(done) / float(done - empty)
                elif done == 0:
                    eff = 0.
                elif empty == done:
                    eff = 99.
                else:
                    eff = 0.
                itemList.append('%.2f' % eff)
                # Add pilot job efficiency evaluation
                if total > 0:
                    eff = float(total - aborted) / float(total) * 100.
                else:
                    eff = 100.
                itemList.append('%.2f' % eff)

                # Evaluate the quality status of the CE
                if total > 10:
                    if eff < 25.:
                        itemList.append('Bad')
                    elif eff < 60.:
                        itemList.append('Poor')
                    elif eff < 85.:
                        itemList.append('Fair')
                    else:
                        itemList.append('Good')
                else:
                    itemList.append('Idle')

                if len(resultDict[site]) == 1 or expand_site:
                    records.append(itemList)

            if len(resultDict[site]) > 1 and not expand_site:
                itemList = [site, 'Multiple']
                for state in allStateNames + ['Total']:
                    if sumDict.has_key(state):
                        itemList.append(sumDict[state])
                    else:
                        itemList.append(0)
                done = sumDict["Done"]
                empty = sumDict["Done_Empty"]
                aborted = sumDict["Aborted"]
                aborted_hour = sumDict["Aborted_Hour"]
                total = sumDict["Total"]

                # Add pilot submission efficiency evaluation
                if (done - empty) > 0:
                    eff = float(done) / float(done - empty)
                elif done == 0:
                    eff = 0.
                elif empty == done:
                    eff = 99.
                else:
                    eff = 0.
                itemList.append('%.2f' % eff)
                # Add pilot job efficiency evaluation
                if total > 0:
                    eff = float(total - aborted) / float(total) * 100.
                else:
                    eff = 100.
                itemList.append('%.2f' % eff)

                # Evaluate the quality status of the Site
                if total > 10:
                    if eff < 25.:
                        itemList.append('Bad')
                    elif eff < 60.:
                        itemList.append('Poor')
                    elif eff < 85.:
                        itemList.append('Fair')
                    else:
                        itemList.append('Good')
                else:
                    itemList.append('Idle')
                records.append(itemList)

            for state in allStateNames + ['Total']:
                if not siteSumDict.has_key(state):
                    siteSumDict[state] = sumDict[state]
                else:
                    siteSumDict[state] += sumDict[state]

        # Perform site selection
        if site_select:
            new_records = []
            for r in records:
                if r[0] in site_select:
                    new_records.append(r)
            records = new_records

        # Perform status selection
        if status_select:
            new_records = []
            for r in records:
                if r[14] in status_select:
                    new_records.append(r)
            records = new_records

        # Get the Site Mask data
        client = RPCClient('WorkloadManagement/WMSAdministrator')
        result = client.getSiteMask()
        if result['OK']:
            siteMask = result['Value']
            for r in records:
                if r[0] in siteMask:
                    r.append('Yes')
                else:
                    r.append('No')
        else:
            for r in records:
                r.append('Unknown')

        finalDict = {}
        finalDict['TotalRecords'] = len(records)
        finalDict['ParameterNames'] = paramNames + \
                                     ['Total', 'PilotsPerJob', 'PilotJobEff', 'Status', 'InMask']

        # Return all the records if maxItems == 0 or the specified number otherwise
        if maxItems:
            finalDict['Records'] = records[startItem:startItem + maxItems]
        else:
            finalDict['Records'] = records

        done = siteSumDict["Done"]
        empty = siteSumDict["Done_Empty"]
        aborted = siteSumDict["Aborted"]
        aborted_hour = siteSumDict["Aborted_Hour"]
        total = siteSumDict["Total"]

        # Add pilot submission efficiency evaluation
        if (done - empty) > 0:
            eff = float(done) / float(done - empty)
        elif done == 0:
            eff = 0.
        elif empty == done:
            eff = 99.
        else:
            eff = 0.
        siteSumDict['PilotsPerJob'] = '%.2f' % eff
        # Add pilot job efficiency evaluation
        if total > 0:
            eff = float(total - aborted) / float(total) * 100.
        else:
            eff = 100.
        siteSumDict['PilotJobEff'] = '%.2f' % eff

        # Evaluate the overall quality status
        if total > 100:
            if eff < 25.:
                siteSumDict['Status'] = 'Bad'
            elif eff < 60.:
                siteSumDict['Status'] = 'Poor'
            elif eff < 85.:
                siteSumDict['Status'] = 'Fair'
            else:
                siteSumDict['Status'] = 'Good'
        else:
            siteSumDict['Status'] = 'Idle'
        finalDict['Extras'] = siteSumDict

        return S_OK(finalDict)
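The pilots-per-job ratio, the job efficiency, and the quality banding are computed three times above, once per CE, once per site, and once for the overall summary (which uses 100 instead of 10 as the 'Idle' threshold). Factoring them into a helper would keep the thresholds in one place; a sketch with our own names:

def pilotQuality(done, empty, aborted, total, idleThreshold=10):
    """Reproduce the duplicated blocks above: submission efficiency,
    job efficiency in percent, and the quality band."""
    if (done - empty) > 0:
        pilotsPerJob = float(done) / float(done - empty)
    elif done == 0:
        pilotsPerJob = 0.
    elif empty == done:
        pilotsPerJob = 99.
    else:
        pilotsPerJob = 0.
    jobEff = float(total - aborted) / float(total) * 100. if total > 0 else 100.
    if total > idleThreshold:
        if jobEff < 25.:
            band = 'Bad'
        elif jobEff < 60.:
            band = 'Poor'
        elif jobEff < 85.:
            band = 'Fair'
        else:
            band = 'Good'
    else:
        band = 'Idle'
    return '%.2f' % pilotsPerJob, '%.2f' % jobEff, band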
  def web_getStatisticsData( self ):
    req = self.__request()

    paletteColor = Palette()

    RPC = RPCClient( "WorkloadManagement/WMSAdministrator" )

    selector = self.request.arguments["statsField"][0]

    if selector == 'Site':
      selector = "GridSite"
    if selector == "Computing Element":
      selector = "DestinationSite"
    elif selector == "Owner Group":
      selector = "OwnerGroup"
    elif selector == "Owner":
      selector = "OwnerDN"


    result = yield self.threadTask( RPC.getPilotStatistics, selector, req )
    if not result['OK']:
      if 'FromDate' in req:
        del req['FromDate']

      if 'LastUpdate' in req:
        del req['LastUpdate']

      if 'ToDate' in req:
        del req['ToDate']

      result = yield self.threadTask( RPC.getCounters, "PilotAgents", [selector], req )

      statistics = {}
      if result['OK']:
        for status, count in result['Value']:
          if "OwnerDN" in status:
            userName = getUsernameForDN( status['OwnerDN'] )
            if userName['OK']:
              status['OwnerDN'] = userName['Value']
          statistics[status[selector]] = count

      result = S_OK(statistics)

    if result["OK"]:
      callback = []
      result = dict( result["Value"] )
      keylist = result.keys()
      keylist.sort()
      if selector == "Site":
        tier1 = gConfig.getValue( "/WebApp/PreferredSites", [] )
        if len( tier1 ) > 0:
          tier1.sort()
          for i in tier1:
            if result.has_key( i ):
              countryCode = i.rsplit( ".", 1 )[1]
              callback.append( {"key":i, "value":result[i], "code":countryCode, "color": paletteColor.getColor( countryCode ) } )
      for key in keylist:
        if selector == "Site" and tier1:
          if key not in tier1:
            try:
              countryCode = key.rsplit( ".", 1 )[1]
            except:
              countryCode = "Unknown"
            callback.append( {"key":key, "value":result[key], "code":countryCode, "color": paletteColor.getColor( key ) } )
        elif selector == "Site" and not tier1:
          try:
            countryCode = key.rsplit( ".", 1 )[1]
          except:
            countryCode = "Unknown"
          callback.append( {"key":key, "value":result[key], "code":countryCode, "color": paletteColor.getColor( key ) } )
        else:
          callback.append( {"key":key, "value":result[key], "code":"", "color": paletteColor.getColor( key ) } )
      callback = {"success":"true", "result":callback}
    else:
      callback = {"success":"false", "error":result["Message"]}

    self.finish( callback )
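
The records assembled above are flat {key, value, code, color} dicts, where the code is simply the last dot-separated token of the DIRAC site name. A minimal sketch of that extraction, with hypothetical sample names:

def extractCountryCode(siteName):
    """Return the trailing dot-separated token of a site name
    (e.g. 'LCG.CERN.ch' -> 'ch'), or 'Unknown' if there is no dot."""
    try:
        return siteName.rsplit(".", 1)[1]
    except IndexError:
        return "Unknown"

assert extractCountryCode("LCG.CERN.ch") == "ch"
assert extractCountryCode("NoDotsHere") == "Unknown"
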
Beispiel #34
0
    def getTimeLeft(self, cpuConsumed=0.0, processors=1):
        """Returns the CPU time left for supported batch systems.
        cpuConsumed is the current raw total CPU consumed, in real seconds.
        """
        # Quit if no norm factor available
        if not self.cpuPower:
            return S_ERROR(
                "/LocalSite/CPUNormalizationFactor not defined for site %s" %
                DIRAC.siteName())

        if not self.batchPlugin:
            return S_ERROR(self.batchError)

        resourceDict = self.batchPlugin.getResourceUsage()
        if not resourceDict["OK"]:
            self.log.warn(
                "Could not determine timeleft for batch system at site %s" %
                DIRAC.siteName())
            return resourceDict

        resources = resourceDict["Value"]
        self.log.debug("self.batchPlugin.getResourceUsage(): %s" %
                       str(resources))
        if not resources.get("CPULimit") and not resources.get(
                "WallClockLimit"):
            # This should never happen
            return S_ERROR("No CPU or WallClock limit obtained")

        # if one of CPULimit or WallClockLimit is missing, compute a reasonable value
        if not resources.get("CPULimit"):
            resources["CPULimit"] = resources["WallClockLimit"] * processors
        elif not resources.get("WallClockLimit"):
            resources["WallClockLimit"] = resources["CPULimit"] / processors

        # if one of CPU or WallClock is missing, compute a reasonable value
        if not resources.get("CPU"):
            resources["CPU"] = resources["WallClock"] * processors
        elif not resources.get("WallClock"):
            resources["WallClock"] = resources["CPU"] / processors

        cpu = float(resources["CPU"])
        cpuLimit = float(resources["CPULimit"])
        wallClock = float(resources["WallClock"])
        wallClockLimit = float(resources["WallClockLimit"])
        batchSystemTimeUnit = resources.get("Unit", "Both")

        # Some batch systems rely on wall clock time and/or cpu time to make allocations
        if batchSystemTimeUnit == "WallClock":
            time = wallClock
            timeLimit = wallClockLimit
        else:
            time = cpu
            timeLimit = cpuLimit

        if time and cpuConsumed > 3600.0 and self.cpuPower:
            # If there has been more than 1 hour of consumed CPU and
            # there is a Normalization set for the current CPU
            # use that value to renormalize the values returned by the batch system
            # NOTE: cpuConsumed is non-zero for call by the JobAgent and 0 for call by the watchdog
            # cpuLimit and cpu may be in the units of the batch system, not real seconds...
            # (in this case the other case won't work)
            # therefore renormalise it using cpuConsumed (which is in real seconds)
            cpuWorkLeft = (timeLimit - time) * self.cpuPower * cpuConsumed / time
        else:
            # FIXME: this is always used by the watchdog... Also used by the JobAgent
            #        if consumed less than 1 hour of CPU
            # It was using self.scaleFactor but this is inconsistent: use the same as above
            # In case the returned cpu and cpuLimit are not in real seconds, this is however rubbish
            cpuWorkLeft = (timeLimit - time) * self.cpuPower

        self.log.verbose("Remaining CPU in normalized units is: %.02f" %
                         cpuWorkLeft)
        return S_OK(cpuWorkLeft)
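
The renormalization above assumes the batch system may report time and timeLimit in its own units rather than real seconds; scaling by cpuConsumed / time converts the remaining budget into real normalized seconds once more than an hour of real CPU has been consumed. A minimal standalone sketch of the same arithmetic, with estimateCpuWorkLeft being my own name rather than a DIRAC API:

def estimateCpuWorkLeft(time, timeLimit, cpuPower, cpuConsumed=0.0):
    """Remaining CPU work in normalized units, mirroring the logic above:
    rescale batch-system units by cpuConsumed / time when more than an
    hour of real CPU has been consumed, otherwise trust them as seconds."""
    if time and cpuConsumed > 3600.0 and cpuPower:
        return (timeLimit - time) * cpuPower * cpuConsumed / time
    return (timeLimit - time) * cpuPower

# E.g. cpuPower=10, 20000 of 72000 batch units used, 7200 real CPU seconds
# consumed: (72000 - 20000) * 10 * 7200 / 20000 = 187200 normalized seconds
print(estimateCpuWorkLeft(20000.0, 72000.0, 10.0, cpuConsumed=7200.0))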