Beispiel #1
0
def isSameSiteSE(se1, se2):
    """ Return whether the two given storage elements are located at the same site

        :param se1: name of the first StorageElement
        :param se2: name of the second StorageElement
    """
    helper = DMSHelpers()
    firstSite = helper.getLocalSiteForSE(se1).get('Value')
    secondSite = helper.getLocalSiteForSE(se2).get('Value')
    # both sites must be resolved, and identical
    return firstSite and secondSite and firstSite == secondSite
Beispiel #2
0
def isSameSiteSE( se1, se2 ):
  """ Return whether the 2 given SEs belong to the same site
  """
  helper = DMSHelpers()
  localSite1 = helper.getLocalSiteForSE( se1 ).get( 'Value' )
  localSite2 = helper.getLocalSiteForSE( se2 ).get( 'Value' )
  # True only when both lookups succeeded and agree
  return localSite1 and localSite2 and localSite1 == localSite2
Beispiel #3
0
class PluginUtilities(object):
  """
  Utility class used by plugins
  """

  def __init__(self, plugin='Standard', transClient=None, dataManager=None, fc=None,
               debug=False, transInThread=None, transID=None):
    """
    c'tor

    Setting defaults

    :param str plugin: plugin name, used for the sub-logger and for CS option lookups
    :param transClient: TransformationClient instance (a new one is created if None)
    :param dataManager: DataManager instance (a new one is created if None)
    :param fc: FileCatalog instance (a new one is created if None)
    :param bool debug: if True, verbose log messages are promoted to info level
    :param dict transInThread: mapping of transformation ID to a thread marker string
                               used as log prefix
    :param transID: transformation ID
    """
    # clients: instantiate defaults only when not injected (eases testing with mocks)
    if transClient is None:
      self.transClient = TransformationClient()
    else:
      self.transClient = transClient
    if dataManager is None:
      self.dm = DataManager()
    else:
      self.dm = dataManager
    if fc is None:
      self.fc = FileCatalog()
    else:
      self.fc = fc

    self.dmsHelper = DMSHelpers()

    self.plugin = plugin
    self.transID = transID
    self.params = {}
    self.groupSize = 0
    self.maxFiles = 0
    self.cachedLFNSize = {}
    self.transString = ''
    self.debug = debug
    if transInThread is None:
      self.transInThread = {}
    else:
      self.transInThread = transInThread

    self.log = gLogger.getSubLogger(plugin)

  def logVerbose(self, message, param=''):
    """ logger helper: verbose messages are promoted to info (tagged '(V)') in debug mode """
    if self.debug:
      self.log.info('(V)' + self.transString + message, param)
    else:
      self.log.verbose(self.transString + message, param)

  def logDebug(self, message, param=''):
    """ logger helper """
    self.log.debug(self.transString + message, param)

  def logInfo(self, message, param=''):
    """ logger helper """
    self.log.info(self.transString + message, param)

  def logWarn(self, message, param=''):
    """ logger helper """
    self.log.warn(self.transString + message, param)

  def logError(self, message, param=''):
    """ logger helper """
    self.log.error(self.transString + message, param)

  def logException(self, message, param='', lException=False):
    """ logger helper """
    self.log.exception(self.transString + message, param, lException)

  def setParameters(self, params):
    """ Set the transformation parameters and extract transID """
    self.params = params
    self.transID = params['TransformationID']
    self.transString = self.transInThread.get(self.transID, ' [NoThread] [%d] ' % self.transID)

  # @timeThis
  def groupByReplicas(self, files, status):
    """
    Generates tasks based on the location of the input data

    :param dict files: LFN to replica-SE-list mapping, e.g.
              {'/this/is/at.1': ['SE1'],
               '/this/is/at.12': ['SE1', 'SE2'],
               '/this/is/at.2': ['SE2'],
               '/this/is/at_123': ['SE1', 'SE2', 'SE3'],
               '/this/is/at_23': ['SE2', 'SE3'],
               '/this/is/at_4': ['SE4']}
    :param str status: transformation status; 'Flush' forces creation of
                       incomplete (smaller than groupSize) tasks

    :returns: S_OK with a list of (replicaSE, [lfns]) tasks
    """
    tasks = []
    nTasks = 0

    if not files:
      return S_OK(tasks)

    # work on a copy, the loop below pops entries
    files = dict(files)

    # Parameters
    if not self.groupSize:
      self.groupSize = self.getPluginParam('GroupSize', 10)
    flush = (status == 'Flush')
    self.logVerbose(
        "groupByReplicas: %d files, groupSize %d, flush %s" %
        (len(files), self.groupSize, flush))

    # Consider files by groups of SEs, a file is only in one group
    # Then consider files site by site, but a file can now be at more than one site
    for groupSE in (True, False):
      if not files:
        break
      seFiles = getFileGroups(files, groupSE=groupSE)
      self.logDebug("fileGroups set: ", seFiles)

      for replicaSE in sortSEs(seFiles):
        lfns = seFiles[replicaSE]
        if lfns:
          tasksLfns = breakListIntoChunks(lfns, self.groupSize)
          lfnsInTasks = []
          for taskLfns in tasksLfns:
            # only full chunks are turned into tasks, unless flushing
            if flush or (len(taskLfns) >= self.groupSize):
              tasks.append((replicaSE, taskLfns))
              lfnsInTasks += taskLfns
          # In case the file was at more than one site, remove it from the other sites' list
          # Remove files from global list
          for lfn in lfnsInTasks:
            files.pop(lfn)
          if not groupSE:
            # Remove files from other SEs
            for se in [se for se in seFiles if se != replicaSE]:
              seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in lfnsInTasks]
      self.logVerbose(
          "groupByReplicas: %d tasks created (groupSE %s)" %
          (len(tasks) - nTasks, str(groupSE)), "%d files not included in tasks" %
          len(files))
      nTasks = len(tasks)

    return S_OK(tasks)

  def createTasksBySize(self, lfns, replicaSE, fileSizes=None, flush=False):
    """
    Split files in groups according to the size and create tasks for a given SE

    :param list lfns: LFNs to group
    :param replicaSE: SE (or SE group name) the tasks are created for
    :param dict fileSizes: optional LFN to size mapping; fetched from the catalog if None
    :param bool flush: if True, also create a final task with the leftover files

    :returns: list of (replicaSE, [lfns]) tasks (empty list on error)
    """
    tasks = []
    if fileSizes is None:
      fileSizes = self._getFileSize(lfns).get('Value')
    if fileSizes is None:
      self.logWarn('Error getting file sizes, no tasks created')
      return tasks
    taskLfns = []
    taskSize = 0
    if not self.groupSize:
      # input size in GB converted to bytes
      self.groupSize = float(self.getPluginParam('GroupSize', 1.)) * 1000 * 1000 * 1000
    if not self.maxFiles:
      # FIXME: prepare for changing the name of the ambiguous CS option
      self.maxFiles = self.getPluginParam('MaxFilesPerTask', self.getPluginParam('MaxFiles', 100))
    # smallest files first, so tasks are filled as evenly as possible
    lfns = sorted(lfns, key=fileSizes.get)
    for lfn in lfns:
      size = fileSizes.get(lfn, 0)
      if size:
        if size > self.groupSize:
          # file alone is already larger than a task: single-file task
          tasks.append((replicaSE, [lfn]))
        else:
          taskSize += size
          taskLfns.append(lfn)
          if (taskSize > self.groupSize) or (len(taskLfns) >= self.maxFiles):
            tasks.append((replicaSE, taskLfns))
            taskLfns = []
            taskSize = 0
    if flush and taskLfns:
      tasks.append((replicaSE, taskLfns))
    if not tasks and not flush and taskLfns:
      self.logVerbose(
          'Not enough data to create a task, and flush not set (%d bytes for groupSize %d)' %
          (taskSize, self.groupSize))
    return tasks

  # @timeThis
  def groupBySize(self, files, status):
    """
    Generate a task for a given amount of data

    :param dict files: LFN to replica-SE-list mapping
    :param str status: transformation status; 'Flush' forces incomplete tasks

    :returns: S_OK with a list of (replicaSE, [lfns]) tasks
    """
    tasks = []
    nTasks = 0

    if not files:
      return S_OK(tasks)

    # work on a copy, the loop below pops entries
    files = dict(files)
    # Parameters
    if not self.groupSize:
      # input size in GB converted to bytes
      self.groupSize = float(self.getPluginParam('GroupSize', 1)) * 1000 * 1000 * 1000
    flush = (status == 'Flush')
    self.logVerbose(
        "groupBySize: %d files, groupSize: %d, flush: %s" %
        (len(files), self.groupSize, flush))

    # Get the file sizes
    res = self._getFileSize(files.keys())
    if not res['OK']:
      return res
    fileSizes = res['Value']

    for groupSE in (True, False):
      if not files:
        break
      seFiles = getFileGroups(files, groupSE=groupSE)

      for replicaSE in sorted(seFiles) if groupSE else sortSEs(seFiles):
        lfns = seFiles[replicaSE]
        newTasks = self.createTasksBySize(lfns, replicaSE, fileSizes=fileSizes, flush=flush)
        lfnsInTasks = []
        for task in newTasks:
          lfnsInTasks += task[1]
        tasks += newTasks

        # Remove the selected files from the size cache
        self.clearCachedFileSize(lfnsInTasks)
        if not groupSE:
          # Remove files from other SEs
          for se in [se for se in seFiles if se != replicaSE]:
            seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in lfnsInTasks]
        # Remove files from global list
        for lfn in lfnsInTasks:
          files.pop(lfn)

      self.logVerbose(
          "groupBySize: %d tasks created with groupSE %s" %
          (len(tasks) - nTasks, str(groupSE)))
      self.logVerbose("groupBySize: %d files have not been included in tasks" % len(files))
      nTasks = len(tasks)

    self.logVerbose("Grouped %d files by size" % len(files))
    return S_OK(tasks)

  def getExistingCounters(self, normalise=False, requestedSites=None):
    """ Get the number of transformation files per UsedSE (or per site)

        :param bool normalise: if True, counts are normalised to percentages
        :param list requestedSites: if given, counts are mapped from SE to site and
                                    restricted to these sites

        :returns: S_OK with a {SE or site: count} dict
    """
    # default is None (not []) to avoid the shared mutable-default pitfall
    if requestedSites is None:
      requestedSites = []
    res = self.transClient.getCounters('TransformationFiles', ['UsedSE'],
                                       {'TransformationID': self.params['TransformationID']})
    if not res['OK']:
      return res
    usageDict = {}
    for usedDict, count in res['Value']:
      usedSE = usedDict['UsedSE']
      if usedSE != 'Unknown':
        usageDict[usedSE] = count
    if requestedSites:
      siteDict = {}
      for se, count in usageDict.items():
        res = getSitesForSE(se)
        if not res['OK']:
          return res
        for site in res['Value']:
          if site in requestedSites:
            siteDict[site] = count
      usageDict = siteDict.copy()
    if normalise:
      usageDict = self._normaliseShares(usageDict)
    return S_OK(usageDict)

  # @timeThis
  def _getFileSize(self, lfns):
    """ Get file size from a cache, if not from the catalog
    #FIXME: have to fill the cachedLFNSize!
    """
    lfns = list(lfns)
    # snapshot of the cache, so concurrent updates don't affect this call
    cachedLFNSize = dict(self.cachedLFNSize)

    fileSizes = {}
    for lfn in [lfn for lfn in lfns if lfn in cachedLFNSize]:
      fileSizes[lfn] = cachedLFNSize[lfn]
    self.logDebug(
        "Found cache hit for File size for %d files out of %d" %
        (len(fileSizes), len(lfns)))
    # only the cache misses go to the catalog
    lfns = [lfn for lfn in lfns if lfn not in cachedLFNSize]
    if lfns:
      fileSizes = self._getFileSizeFromCatalog(lfns, fileSizes)
      if not fileSizes['OK']:
        self.logError(fileSizes['Message'])
        return fileSizes
      fileSizes = fileSizes['Value']
    return S_OK(fileSizes)

  # @timeThis
  def _getFileSizeFromCatalog(self, lfns, fileSizes):
    """
    Get file size from the catalog

    :param list lfns: LFNs to look up
    :param dict fileSizes: already-known sizes, merged into the result

    :returns: S_OK with the merged {lfn: size} dict, S_ERROR on catalog failure
    """
    lfns = list(lfns)
    fileSizes = dict(fileSizes)

    res = self.fc.getFileSize(lfns)
    if not res['OK']:
      return S_ERROR("Failed to get sizes for all files: %s" % res['Message'])
    if res['Value']['Failed']:
      errorReason = sorted(set(res['Value']['Failed'].values()))
      self.logWarn("Failed to get sizes for %d files:" % len(res['Value']['Failed']), errorReason)
    fileSizes.update(res['Value']['Successful'])
    # keep the cache up to date for subsequent calls
    self.cachedLFNSize.update(res['Value']['Successful'])
    self.logVerbose("Got size of %d files from catalog" % len(lfns))
    return S_OK(fileSizes)

  def clearCachedFileSize(self, lfns):
    """ Utility function: drop the given LFNs from the size cache
    """
    for lfn in [lfn for lfn in lfns if lfn in self.cachedLFNSize]:
      self.cachedLFNSize.pop(lfn)

  def getPluginParam(self, name, default=None):
    """ Get plugin parameters using specific settings or settings defined in the CS
        Caution: the type returned is that of the default value
    """
    # get the value of a parameter looking 1st in the CS
    if default is not None:
      valueType = type(default)
    else:
      valueType = None
    # First look at a generic value...
    optionPath = "TransformationPlugins/%s" % (name)
    value = Operations().getValue(optionPath, None)
    self.logVerbose("Default plugin param %s: '%s'" % (optionPath, value))
    # Then look at a plugin-specific value
    optionPath = "TransformationPlugins/%s/%s" % (self.plugin, name)
    value = Operations().getValue(optionPath, value)
    self.logVerbose("Specific plugin param %s: '%s'" % (optionPath, value))
    if value is not None:
      default = value
    # Finally look at a transformation-specific parameter
    value = self.params.get(name, default)
    self.logVerbose(
        "Transformation plugin param %s: '%s'. Convert to %s" %
        (name, value, str(valueType)))
    # coerce CS strings to the type of the default value
    if valueType and not isinstance(value, valueType):
      if valueType is list:
        try:
          value = ast.literal_eval(value) if value and value != 'None' else []
        # literal_eval('SE-DST') -> ValueError
        # literal_eval('SE_MC-DST') -> SyntaxError
        # Don't ask...
        except (ValueError, SyntaxError):
          value = [val for val in value.replace(' ', '').split(',') if val]

      elif valueType is int:
        value = int(value)
      elif valueType is float:
        value = float(value)
      elif valueType is bool:
        if value in ('False', 'No', 'None', None, 0):
          value = False
        else:
          value = bool(value)
      elif valueType is not str:
        self.logWarn(
            "Unknown parameter type (%s) for %s, passed as string" %
            (str(valueType), name))
    self.logVerbose("Final plugin param %s: '%s'" % (name, value))
    return value

  @staticmethod
  def _normaliseShares(originalShares):
    """ Normalize shares to 1 """
    total = sum(float(share) for share in originalShares.values())
    return dict([(site, 100. * float(share) / total if total else 0.)
                 for site, share in originalShares.items()])

  def uniqueSEs(self, ses):
    """ return a list of SEs that are not physically the same """
    newSEs = []
    for se in ses:
      if not self.isSameSEInList(se, newSEs):
        newSEs.append(se)
    return newSEs

  def isSameSE(self, se1, se2):
    """ Check if 2 SEs are indeed the same.

        :param se1: name of the first StorageElement
        :param se2: name of the second StorageElement

        :returns: True/False if they are considered the same.
                  See :py:mod:`~DIRAC.Resources.Storage.StorageElement.StorageElementItem.isSameSE`
    """
    if se1 == se2:
      return True

    return StorageElement(se1).isSameSE(StorageElement(se2))

  def isSameSEInList(self, se1, seList):
    """ Check if an SE is the same as any in a list """
    if se1 in seList:
      return True
    for se in seList:
      if self.isSameSE(se1, se):
        return True
    return False

  def closerSEs(self, existingSEs, targetSEs, local=False):
    """ Order the targetSEs such that the first ones are closer to existingSEs. Keep all elements in targetSEs
    """
    setTarget = set(targetSEs)
    sameSEs = set([se1 for se1 in setTarget for se2 in existingSEs if self.isSameSE(se1, se2)])
    targetSEs = setTarget - set(sameSEs)
    if targetSEs:
      # Some SEs are left, look for sites
      existingSites = [self.dmsHelper.getLocalSiteForSE(se).get('Value')
                       for se in existingSEs]
      existingSites = set([site for site in existingSites if site])
      closeSEs = set([se for se in targetSEs
                      if self.dmsHelper.getLocalSiteForSE(se).get('Value') in existingSites])
      otherSEs = targetSEs - closeSEs
      # shuffle within each preference group so no SE is systematically favoured
      targetSEs = list(closeSEs)
      random.shuffle(targetSEs)
      if not local and otherSEs:
        otherSEs = list(otherSEs)
        random.shuffle(otherSEs)
        targetSEs += otherSEs
    else:
      targetSEs = []
    return (targetSEs + list(sameSEs)) if not local else targetSEs
Beispiel #4
0
def getFilesToStage(lfnList,
                    jobState=None,
                    checkOnlyTapeSEs=None,
                    jobLog=None):
    """ Utility that returns out of a list of LFNs those files that are offline,
        and those for which at least one copy is online

    :param list lfnList: LFNs (or a single LFN string) to check
    :param jobState: optional job state object used to get the owner/group for the check
    :param checkOnlyTapeSEs: passed through to _checkFilesToStage
    :param jobLog: optional logger passed through to _checkFilesToStage

    :returns: S_OK with a dict containing 'onlineLFNs', 'offlineLFNs' (per-SE dict),
              'failedLFNs', 'absentLFNs' and 'onlineSites'
    """
    if not lfnList:
        return S_OK({
            'onlineLFNs': [],
            'offlineLFNs': {},
            'failedLFNs': [],
            'absentLFNs': {}
        })

    dm = DataManager()
    if isinstance(lfnList, six.string_types):
        lfnList = [lfnList]

    lfnListReplicas = dm.getReplicasForJobs(lfnList, getUrl=False)
    if not lfnListReplicas['OK']:
        return lfnListReplicas

    offlineLFNsDict = {}
    onlineLFNs = {}
    offlineLFNs = {}
    absentLFNs = {}
    failedLFNs = set()
    # BUGFIX: onlineSites must be defined even when no replica is processed below,
    # as it is referenced in the final S_OK return
    onlineSites = set()
    if lfnListReplicas['Value']['Failed']:
        # Check if files are not existing
        for lfn, reason in lfnListReplicas['Value']['Failed'].items():
            # FIXME: awful check until FC returns a proper error
            if cmpError(reason, errno.ENOENT) or 'No such file' in reason:
                # The file doesn't exist, job must be Failed
                # FIXME: it is not possible to return here an S_ERROR(), return the message only
                absentLFNs[lfn] = S_ERROR(errno.ENOENT,
                                          'File not in FC')['Message']
        if absentLFNs:
            return S_OK({
                'onlineLFNs': list(onlineLFNs),
                'offlineLFNs': offlineLFNsDict,
                'failedLFNs': list(failedLFNs),
                'absentLFNs': absentLFNs
            })
        return S_ERROR("Failures in getting replicas")

    lfnListReplicas = lfnListReplicas['Value']['Successful']
    # If a file is reported here at a tape SE, it is not at a disk SE as we use disk in priority
    # We shall check all file anyway in order to make sure they exist
    seToLFNs = dict()
    for lfn, ses in lfnListReplicas.items():
        for se in ses:
            seToLFNs.setdefault(se, list()).append(lfn)

    if seToLFNs:
        if jobState:
            # Get user name and group from the job state
            userName = jobState.getAttribute('Owner')
            if not userName['OK']:
                return userName
            userName = userName['Value']

            userGroup = jobState.getAttribute('OwnerGroup')
            if not userGroup['OK']:
                return userGroup
            userGroup = userGroup['Value']
        else:
            userName = None
            userGroup = None
        # Check whether files are Online or Offline, or missing at SE
        result = _checkFilesToStage(
            seToLFNs,
            onlineLFNs,
            offlineLFNs,
            absentLFNs,  # pylint: disable=unexpected-keyword-arg
            checkOnlyTapeSEs=checkOnlyTapeSEs,
            jobLog=jobLog,
            proxyUserName=userName,
            proxyUserGroup=userGroup,
            executionLock=True)

        if not result['OK']:
            return result
        # whatever is neither online, offline nor absent is considered failed
        failedLFNs = set(lfnList) - set(onlineLFNs) - set(offlineLFNs) - set(
            absentLFNs)

        # Get the online SEs
        dmsHelper = DMSHelpers()
        onlineSEs = set(se for ses in onlineLFNs.values() for se in ses)
        onlineSites = set(
            dmsHelper.getLocalSiteForSE(se).get('Value')
            for se in onlineSEs) - {None}
        for lfn in offlineLFNs:
            ses = offlineLFNs[lfn]
            if len(ses) == 1:
                # No choice, let's go
                offlineLFNsDict.setdefault(ses[0], list()).append(lfn)
                continue
            # Try and get an SE at a site already with online files
            found = False
            if onlineSites:
                # If there is at least one online site, select one
                for se in ses:
                    site = dmsHelper.getLocalSiteForSE(se)
                    if site['OK']:
                        if site['Value'] in onlineSites:
                            offlineLFNsDict.setdefault(se, list()).append(lfn)
                            found = True
                            break
            # No online site found in common, select randomly
            if not found:
                offlineLFNsDict.setdefault(random.choice(ses),
                                           list()).append(lfn)

    return S_OK({
        'onlineLFNs': list(onlineLFNs),
        'offlineLFNs': offlineLFNsDict,
        'failedLFNs': list(failedLFNs),
        'absentLFNs': absentLFNs,
        'onlineSites': onlineSites
    })
Beispiel #5
0
class PluginUtilities(object):
  """
  Utility class used by plugins
  """

  def __init__(self, plugin='Standard', transClient=None, dataManager=None, fc=None,
               debug=False, transInThread=None, transID=None):
    """
    c'tor

    Setting defaults

    :param str plugin: plugin name, used for the sub-logger and for CS option lookups
    :param transClient: TransformationClient instance (a new one is created if None)
    :param dataManager: DataManager instance (a new one is created if None)
    :param fc: FileCatalog instance (a new one is created if None)
    :param bool debug: if True, verbose log messages are promoted to info level
    :param dict transInThread: mapping of transformation ID to a thread marker string
    :param transID: transformation ID
    """
    # clients: instantiate defaults only when not injected (eases testing with mocks)
    if transClient is None:
      self.transClient = TransformationClient()
    else:
      self.transClient = transClient
    if dataManager is None:
      self.dm = DataManager()
    else:
      self.dm = dataManager
    if fc is None:
      self.fc = FileCatalog()
    else:
      self.fc = fc

    self.dmsHelper = DMSHelpers()

    self.plugin = plugin
    self.transID = transID
    self.params = {}
    self.groupSize = 0
    self.maxFiles = 0
    self.cachedLFNSize = {}
    self.transString = ''
    self.debug = debug
    if transInThread is None:
      self.transInThread = {}
    else:
      self.transInThread = transInThread

    self.log = gLogger.getSubLogger(self.plugin +
                                    self.transInThread.get(self.transID, ' [NoThread] [%s] ' % self.transID))
    # FIXME: This doesn't work (yet) but should soon, will allow scripts to get the context
    self.log.showHeaders(True)

  def logVerbose(self, message, param=''):
    """ logger helper: verbose messages are promoted to info (tagged '(V)') in debug mode """
    if self.debug:
      # BUGFIX: use %s (as in __init__), since transID may be None before setParameters
      log = gLogger.getSubLogger(self.plugin + ' (V)' +
                                 self.transInThread.get(self.transID, ' [NoThread] [%s] ' % self.transID))
      log.info(message, param)
    else:
      self.log.verbose(message, param)

  def logDebug(self, message, param=''):
    """ logger helper """
    self.log.debug(message, param)

  def logInfo(self, message, param=''):
    """ logger helper """
    self.log.info(message, param)

  def logWarn(self, message, param=''):
    """ logger helper """
    self.log.warn(message, param)

  def logError(self, message, param=''):
    """ logger helper """
    self.log.error(message, param)

  def logException(self, message, param='', lException=False):
    """ logger helper """
    self.log.exception(message, param, lException)

  def setParameters(self, params):
    """ Set the transformation parameters and extract transID """
    self.params = params
    self.transID = params['TransformationID']
    # use %s for consistency with __init__ (transID formatting)
    self.log = gLogger.getSubLogger(self.plugin +
                                    self.transInThread.get(self.transID, ' [NoThread] [%s] ' % self.transID))

  # @timeThis
  def groupByReplicas(self, files, status):
    """
    Generates tasks based on the location of the input data

    :param dict files: LFN to replica-SE-list mapping, e.g.
              {'/this/is/at.1': ['SE1'],
               '/this/is/at.12': ['SE1', 'SE2'],
               '/this/is/at.2': ['SE2'],
               '/this/is/at_123': ['SE1', 'SE2', 'SE3'],
               '/this/is/at_23': ['SE2', 'SE3'],
               '/this/is/at_4': ['SE4']}
    :param str status: transformation status; 'Flush' forces creation of
                       incomplete (smaller than groupSize) tasks

    :returns: S_OK with a list of (replicaSE, [lfns]) tasks
    """
    tasks = []
    nTasks = 0

    if not files:
      return S_OK(tasks)

    # work on a copy, the loop below pops entries
    files = dict(files)

    # Parameters
    if not self.groupSize:
      self.groupSize = self.getPluginParam('GroupSize', 10)
    flush = (status == 'Flush')
    self.logVerbose(
        "groupByReplicas: %d files, groupSize %d, flush %s" %
        (len(files), self.groupSize, flush))

    # Consider files by groups of SEs, a file is only in one group
    # Then consider files site by site, but a file can now be at more than one site
    for groupSE in (True, False):
      if not files:
        break
      seFiles = getFileGroups(files, groupSE=groupSE)
      self.logDebug("fileGroups set: ", seFiles)

      for replicaSE in sortSEs(seFiles):
        lfns = seFiles[replicaSE]
        if lfns:
          tasksLfns = breakListIntoChunks(lfns, self.groupSize)
          lfnsInTasks = []
          for taskLfns in tasksLfns:
            # only full chunks are turned into tasks, unless flushing
            if flush or (len(taskLfns) >= self.groupSize):
              tasks.append((replicaSE, taskLfns))
              lfnsInTasks += taskLfns
          # In case the file was at more than one site, remove it from the other sites' list
          # Remove files from global list
          for lfn in lfnsInTasks:
            files.pop(lfn)
          if not groupSE:
            # Remove files from other SEs
            for se in [se for se in seFiles if se != replicaSE]:
              seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in lfnsInTasks]
      self.logVerbose(
          "groupByReplicas: %d tasks created (groupSE %s)" %
          (len(tasks) - nTasks, str(groupSE)), "%d files not included in tasks" %
          len(files))
      nTasks = len(tasks)

    return S_OK(tasks)

  def createTasksBySize(self, lfns, replicaSE, fileSizes=None, flush=False):
    """
    Split files in groups according to the size and create tasks for a given SE

    :param list lfns: LFNs to group
    :param replicaSE: SE (or SE group name) the tasks are created for
    :param dict fileSizes: optional LFN to size mapping; fetched from the catalog if None
    :param bool flush: if True, also create a final task with the leftover files

    :returns: list of (replicaSE, [lfns]) tasks (empty list on error)
    """
    tasks = []
    if fileSizes is None:
      fileSizes = self._getFileSize(lfns).get('Value')
    if fileSizes is None:
      self.logWarn('Error getting file sizes, no tasks created')
      return tasks
    taskLfns = []
    taskSize = 0
    if not self.groupSize:
      # input size in GB converted to bytes
      self.groupSize = float(self.getPluginParam('GroupSize', 1.)) * 1000 * 1000 * 1000
    if not self.maxFiles:
      # FIXME: prepare for changing the name of the ambiguous CS option
      self.maxFiles = self.getPluginParam('MaxFilesPerTask', self.getPluginParam('MaxFiles', 100))
    # smallest files first, so tasks are filled as evenly as possible
    lfns = sorted(lfns, key=fileSizes.get)
    for lfn in lfns:
      size = fileSizes.get(lfn, 0)
      if size:
        if size > self.groupSize:
          # file alone is already larger than a task: single-file task
          tasks.append((replicaSE, [lfn]))
        else:
          taskSize += size
          taskLfns.append(lfn)
          if (taskSize > self.groupSize) or (len(taskLfns) >= self.maxFiles):
            tasks.append((replicaSE, taskLfns))
            taskLfns = []
            taskSize = 0
    if flush and taskLfns:
      tasks.append((replicaSE, taskLfns))
    if not tasks and not flush and taskLfns:
      self.logVerbose(
          'Not enough data to create a task, and flush not set (%d bytes for groupSize %d)' %
          (taskSize, self.groupSize))
    return tasks

  # @timeThis
  def groupBySize(self, files, status):
    """
    Generate a task for a given amount of data

    :param dict files: LFN to replica-SE-list mapping
    :param str status: transformation status; 'Flush' forces incomplete tasks

    :returns: S_OK with a list of (replicaSE, [lfns]) tasks
    """
    tasks = []
    nTasks = 0

    if not files:
      return S_OK(tasks)

    # work on a copy, the loop below pops entries
    files = dict(files)
    # Parameters
    if not self.groupSize:
      # input size in GB converted to bytes
      self.groupSize = float(self.getPluginParam('GroupSize', 1)) * 1000 * 1000 * 1000
    flush = (status == 'Flush')
    self.logVerbose(
        "groupBySize: %d files, groupSize: %d, flush: %s" %
        (len(files), self.groupSize, flush))

    # Get the file sizes
    res = self._getFileSize(list(files))
    if not res['OK']:
      return res
    fileSizes = res['Value']

    for groupSE in (True, False):
      if not files:
        break
      seFiles = getFileGroups(files, groupSE=groupSE)

      for replicaSE in sorted(seFiles) if groupSE else sortSEs(seFiles):
        lfns = seFiles[replicaSE]
        newTasks = self.createTasksBySize(lfns, replicaSE, fileSizes=fileSizes, flush=flush)
        lfnsInTasks = []
        for task in newTasks:
          lfnsInTasks += task[1]
        tasks += newTasks

        # Remove the selected files from the size cache
        self.clearCachedFileSize(lfnsInTasks)
        if not groupSE:
          # Remove files from other SEs
          for se in [se for se in seFiles if se != replicaSE]:
            seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in lfnsInTasks]
        # Remove files from global list
        for lfn in lfnsInTasks:
          files.pop(lfn)

      self.logVerbose(
          "groupBySize: %d tasks created with groupSE %s" %
          (len(tasks) - nTasks, str(groupSE)))
      self.logVerbose("groupBySize: %d files have not been included in tasks" % len(files))
      nTasks = len(tasks)

    self.logVerbose("Grouped %d files by size" % len(files))
    return S_OK(tasks)

  def getExistingCounters(self, normalise=False, requestedSites=None):
    """ Get the number of transformation files per UsedSE (or per site)

        :param bool normalise: if True, counts are normalised to percentages
        :param list requestedSites: if given, counts are mapped from SE to site and
                                    restricted to these sites

        :returns: S_OK with a {SE or site: count} dict
    """
    # default is None (not []) to avoid the shared mutable-default pitfall
    if requestedSites is None:
      requestedSites = []
    res = self.transClient.getCounters('TransformationFiles', ['UsedSE'],
                                       {'TransformationID': self.params['TransformationID']})
    if not res['OK']:
      return res
    usageDict = {}
    for usedDict, count in res['Value']:
      usedSE = usedDict['UsedSE']
      if usedSE != 'Unknown':
        usageDict[usedSE] = count
    if requestedSites:
      siteDict = {}
      for se, count in usageDict.items():
        res = getSitesForSE(se)
        if not res['OK']:
          return res
        for site in res['Value']:
          if site in requestedSites:
            siteDict[site] = count
      usageDict = siteDict.copy()
    if normalise:
      usageDict = self._normaliseShares(usageDict)
    return S_OK(usageDict)

  # @timeThis
  def _getFileSize(self, lfns):
    """ Get file size from a cache, if not from the catalog
    #FIXME: have to fill the cachedLFNSize!
    """
    lfns = list(lfns)
    # snapshot of the cache, so concurrent updates don't affect this call
    cachedLFNSize = dict(self.cachedLFNSize)

    fileSizes = {}
    for lfn in [lfn for lfn in lfns if lfn in cachedLFNSize]:
      fileSizes[lfn] = cachedLFNSize[lfn]
    self.logDebug(
        "Found cache hit for File size for %d files out of %d" %
        (len(fileSizes), len(lfns)))
    # only the cache misses go to the catalog
    lfns = [lfn for lfn in lfns if lfn not in cachedLFNSize]
    if lfns:
      fileSizes = self._getFileSizeFromCatalog(lfns, fileSizes)
      if not fileSizes['OK']:
        self.logError(fileSizes['Message'])
        return fileSizes
      fileSizes = fileSizes['Value']
    return S_OK(fileSizes)

  # @timeThis
  def _getFileSizeFromCatalog(self, lfns, fileSizes):
    """
    Get file size from the catalog

    :param list lfns: LFNs to look up
    :param dict fileSizes: already-known sizes, merged into the result

    :returns: S_OK with the merged {lfn: size} dict, S_ERROR on catalog failure
    """
    lfns = list(lfns)
    fileSizes = dict(fileSizes)

    res = self.fc.getFileSize(lfns)
    if not res['OK']:
      return S_ERROR("Failed to get sizes for all files: %s" % res['Message'])
    if res['Value']['Failed']:
      errorReason = sorted(set(res['Value']['Failed'].values()))
      self.logWarn("Failed to get sizes for %d files:" % len(res['Value']['Failed']), errorReason)
    fileSizes.update(res['Value']['Successful'])
    # keep the cache up to date for subsequent calls
    self.cachedLFNSize.update(res['Value']['Successful'])
    self.logVerbose("Got size of %d files from catalog" % len(lfns))
    return S_OK(fileSizes)

  def clearCachedFileSize(self, lfns):
    """ Utility function: drop the given LFNs from the size cache
    """
    for lfn in [lfn for lfn in lfns if lfn in self.cachedLFNSize]:
      self.cachedLFNSize.pop(lfn)

  def getPluginParam(self, name, default=None):
    """ Get plugin parameters using specific settings or settings defined in the CS
        Caution: the type returned is that of the default value
    """
    # get the value of a parameter looking 1st in the CS
    if default is not None:
      valueType = type(default)
    else:
      valueType = None
    # First look at a generic value...
    optionPath = "TransformationPlugins/%s" % (name)
    value = Operations().getValue(optionPath, None)
    self.logVerbose("Default plugin param %s: '%s'" % (optionPath, value))
    # Then look at a plugin-specific value
    optionPath = "TransformationPlugins/%s/%s" % (self.plugin, name)
    value = Operations().getValue(optionPath, value)
    self.logVerbose("Specific plugin param %s: '%s'" % (optionPath, value))
    if value is not None:
      default = value
    # Finally look at a transformation-specific parameter
    value = self.params.get(name, default)
    self.logVerbose(
        "Transformation plugin param %s: '%s'. Convert to %s" %
        (name, value, str(valueType)))
    # coerce CS strings to the type of the default value
    if valueType and not isinstance(value, valueType):
      if valueType is list:
        try:
          value = ast.literal_eval(value) if value and value != 'None' else []
        # literal_eval('SE-DST') -> ValueError
        # literal_eval('SE_MC-DST') -> SyntaxError
        # Don't ask...
        except (ValueError, SyntaxError):
          value = [val for val in value.replace(' ', '').split(',') if val]

      elif valueType is int:
        value = int(value)
      elif valueType is float:
        value = float(value)
      elif valueType is bool:
        if value in ('False', 'No', 'None', None, 0):
          value = False
        else:
          value = bool(value)
      elif valueType is not str:
        self.logWarn(
            "Unknown parameter type (%s) for %s, passed as string" %
            (str(valueType), name))
    self.logVerbose("Final plugin param %s: '%s'" % (name, value))
    return value

  @staticmethod
  def _normaliseShares(originalShares):
    """ Normalize shares to 1 """
    total = sum(float(share) for share in originalShares.values())
    return dict([(site, 100. * float(share) / total if total else 0.)
                 for site, share in originalShares.items()])

  def uniqueSEs(self, ses):
    """ return a list of SEs that are not physically the same """
    newSEs = []
    for se in ses:
      if not self.isSameSEInList(se, newSEs):
        newSEs.append(se)
    return newSEs

  def isSameSE(self, se1, se2):
    """ Check if 2 SEs are indeed the same.

        :param se1: name of the first StorageElement
        :param se2: name of the second StorageElement

        :returns: True/False if they are considered the same.
                  See :py:mod:`~DIRAC.Resources.Storage.StorageElement.StorageElementItem.isSameSE`
    """
    if se1 == se2:
      return True

    return StorageElement(se1).isSameSE(StorageElement(se2))

  def isSameSEInList(self, se1, seList):
    """ Check if an SE is the same as any in a list """
    if se1 in seList:
      return True
    for se in seList:
      if self.isSameSE(se1, se):
        return True
    return False

  def closerSEs(self, existingSEs, targetSEs, local=False):
    """ Order the targetSEs such that the first ones are closer to existingSEs. Keep all elements in targetSEs
    """
    setTarget = set(targetSEs)
    sameSEs = set([se1 for se1 in setTarget for se2 in existingSEs if self.isSameSE(se1, se2)])
    targetSEs = setTarget - set(sameSEs)
    if targetSEs:
      # Some SEs are left, look for sites
      existingSites = [self.dmsHelper.getLocalSiteForSE(se).get('Value')
                       for se in existingSEs]
      existingSites = set([site for site in existingSites if site])
      closeSEs = set([se for se in targetSEs
                      if self.dmsHelper.getLocalSiteForSE(se).get('Value') in existingSites])
      otherSEs = targetSEs - closeSEs
      # shuffle within each preference group so no SE is systematically favoured
      targetSEs = list(closeSEs)
      random.shuffle(targetSEs)
      if not local and otherSEs:
        otherSEs = list(otherSEs)
        random.shuffle(otherSEs)
        targetSEs += otherSEs
    else:
      targetSEs = []
    return (targetSEs + list(sameSEs)) if not local else targetSEs

  @staticmethod
  def seParamtoList(inputParam):
    """Transform ``inputParam`` to list.

    :param inputParam: can be string, list, or string representation of list
    :returns: list
    """
    if not inputParam:
      return []
    # BUGFIX: check for an actual list first, so the string operations below
    # are only applied to strings
    if isinstance(inputParam, list):
      return inputParam
    if inputParam.count('['):
      # WARNING: eval on a configuration-provided string; only trusted CS values
      # should ever reach this point
      return eval(inputParam)  # pylint: disable=eval-used
    return [inputParam]
Beispiel #6
0
class PluginUtilities(object):
    """Utility class used by transformation plugins.

    Provides helpers to group files into tasks (by replica location or by
    total size), cache file sizes, resolve plugin parameters from the CS,
    and compare/order storage elements by site proximity.
    """
    def __init__(self,
                 plugin='Standard',
                 transClient=None,
                 dataManager=None,
                 fc=None,
                 debug=False,
                 transInThread=None,
                 transID=None):
        """Constructor: set defaults, creating clients only when not injected.

        :param str plugin: plugin name, used for logging and CS lookups
        :param transClient: TransformationClient (created if None)
        :param dataManager: DataManager (created if None)
        :param fc: FileCatalog (created if None)
        :param bool debug: if True, verbose messages are promoted to info level
        :param dict transInThread: map of transformation ID to thread marker string
        :param transID: transformation ID
        """
        # clients: instantiate defaults only when not provided (eases testing)
        if transClient is None:
            self.transClient = TransformationClient()
        else:
            self.transClient = transClient
        if dataManager is None:
            self.dm = DataManager()
        else:
            self.dm = dataManager
        if fc is None:
            self.fc = FileCatalog()
        else:
            self.fc = fc

        self.dmsHelper = DMSHelpers()

        self.plugin = plugin
        self.transID = transID
        self.params = {}
        self.groupSize = 0
        self.maxFiles = 0
        self.cachedLFNSize = {}   # LFN -> size cache filled from the catalog
        self.transString = ''
        self.debug = debug
        self.seConfig = {}        # SE name -> {'Host':..., 'Path':...} cache
        if transInThread is None:
            self.transInThread = {}
        else:
            self.transInThread = transInThread

        self.log = gLogger.getSubLogger("%s/PluginUtilities" % plugin)

    def logVerbose(self, message, param=''):
        """Logger helper: verbose, promoted to info level when debugging."""
        if self.debug:
            self.log.info('(V)' + self.transString + message, param)
        else:
            self.log.verbose(self.transString + message, param)

    def logDebug(self, message, param=''):
        """Logger helper (debug level)."""
        self.log.debug(self.transString + message, param)

    def logInfo(self, message, param=''):
        """Logger helper (info level)."""
        self.log.info(self.transString + message, param)

    def logWarn(self, message, param=''):
        """Logger helper (warning level)."""
        self.log.warn(self.transString + message, param)

    def logError(self, message, param=''):
        """Logger helper (error level)."""
        self.log.error(self.transString + message, param)

    def logException(self, message, param='', lException=False):
        """Logger helper (exception level, with optional exception object)."""
        self.log.exception(self.transString + message, param, lException)

    def setParameters(self, params):
        """Store the transformation parameters and derive the log prefix."""
        self.params = params
        self.transID = params['TransformationID']
        self.transString = self.transInThread.get(
            self.transID,
            ' [NoThread] [%d] ' % self.transID) + '%s: ' % self.plugin

    @timeThis
    def groupByReplicas(self, files, status):
        """
    Generates tasks based on the location of the input data

   :param dict files: LFN -> list of SEs holding a replica, e.g.
              {'/this/is/at.1': ['SE1'],
               '/this/is/at.12': ['SE1', 'SE2'],
               '/this/is/at.2': ['SE2'],
               '/this/is/at_123': ['SE1', 'SE2', 'SE3'],
               '/this/is/at_23': ['SE2', 'SE3'],
               '/this/is/at_4': ['SE4']}
    :param str status: transformation status; 'Flush' forces incomplete tasks out
    :returns: S_OK with a list of (SE(s), [lfns]) tasks
    """
        tasks = []
        nTasks = 0

        if not files:
            return S_OK(tasks)

        files = dict(files)

        # Parameters
        if not self.groupSize:
            self.groupSize = self.getPluginParam('GroupSize', 10)
        flush = (status == 'Flush')
        self.logVerbose("groupByReplicas: %d files, groupSize %d, flush %s" %
                        (len(files), self.groupSize, flush))

        # Consider files by groups of SEs, a file is only in one group
        # Then consider files site by site, but a file can now be at more than one site
        for groupSE in (True, False):
            if not files:
                break
            seFiles = getFileGroups(files, groupSE=groupSE)
            self.logDebug("fileGroups set: ", seFiles)

            for replicaSE in sortSEs(seFiles):
                lfns = seFiles[replicaSE]
                if lfns:
                    tasksLfns = breakListIntoChunks(lfns, self.groupSize)
                    lfnsInTasks = []
                    for taskLfns in tasksLfns:
                        # Only keep full tasks, unless flushing in the per-site pass
                        if (flush and not groupSE) or (len(taskLfns) >=
                                                       self.groupSize):
                            tasks.append((replicaSE, taskLfns))
                            lfnsInTasks += taskLfns
                    # In case the file was at more than one site, remove it from the other sites' list
                    # Remove files from global list
                    for lfn in lfnsInTasks:
                        files.pop(lfn)
                    if not groupSE:
                        # Remove files from other SEs
                        for se in [se for se in seFiles if se != replicaSE]:
                            seFiles[se] = [
                                lfn for lfn in seFiles[se]
                                if lfn not in lfnsInTasks
                            ]
            self.logVerbose(
                "groupByReplicas: %d tasks created (groupSE %s), %d files not included in tasks"
                % (len(tasks) - nTasks, str(groupSE), len(files)))
            nTasks = len(tasks)

        return S_OK(tasks)

    def createTasksBySize(self, lfns, replicaSE, fileSizes=None, flush=False):
        """
    Split files in groups according to the size and create tasks for a given SE

    :param list lfns: LFNs to split into tasks
    :param replicaSE: SE (or SE group string) the tasks are assigned to
    :param dict fileSizes: optional LFN -> size map; queried if not given
    :param bool flush: if True, also emit the last, incomplete task
    :returns: list of (replicaSE, [lfns]) tasks
    """
        tasks = []
        if fileSizes is None:
            fileSizes = self._getFileSize(lfns).get('Value')
        if fileSizes is None:
            self.logWarn('Error getting file sizes, no tasks created')
            return tasks
        taskLfns = []
        taskSize = 0
        if not self.groupSize:
            self.groupSize = float(
                self.getPluginParam('GroupSize', 1.)
            ) * 1000 * 1000 * 1000  # input size in GB converted to bytes
        if not self.maxFiles:
            self.maxFiles = self.getPluginParam('MaxFiles', 100)
        # Process files from smallest to largest so tasks fill up evenly
        lfns = sorted(lfns, key=fileSizes.get)
        for lfn in lfns:
            size = fileSizes.get(lfn, 0)
            if size:
                if size > self.groupSize:
                    # A single file already exceeds the group size: own task
                    tasks.append((replicaSE, [lfn]))
                else:
                    taskSize += size
                    taskLfns.append(lfn)
                    if (taskSize > self.groupSize) or (len(taskLfns) >=
                                                       self.maxFiles):
                        tasks.append((replicaSE, taskLfns))
                        taskLfns = []
                        taskSize = 0
        if flush and taskLfns:
            tasks.append((replicaSE, taskLfns))
        return tasks

    @timeThis
    def groupBySize(self, files, status):
        """
    Generate a task for a given amount of data

    :param dict files: LFN -> replica SEs
    :param str status: transformation status; 'Flush' forces incomplete tasks out
    :returns: S_OK with a list of (SE(s), [lfns]) tasks
    """
        tasks = []
        nTasks = 0

        if not files:
            return S_OK(tasks)

        files = dict(files)
        # Parameters
        if not self.groupSize:
            self.groupSize = float(self.getPluginParam(
                'GroupSize',
                1)) * 1000 * 1000 * 1000  # input size in GB converted to bytes
        flush = (status == 'Flush')
        self.logVerbose("groupBySize: %d files, groupSize: %d, flush: %s" %
                        (len(files), self.groupSize, flush))

        # Get the file sizes
        res = self._getFileSize(files.keys())
        if not res['OK']:
            return res
        fileSizes = res['Value']

        for groupSE in (True, False):
            if not files:
                break
            seFiles = getFileGroups(files, groupSE=groupSE)

            for replicaSE in sorted(seFiles) if groupSE else sortSEs(seFiles):
                lfns = seFiles[replicaSE]
                newTasks = self.createTasksBySize(lfns,
                                                  replicaSE,
                                                  fileSizes=fileSizes,
                                                  flush=flush)
                lfnsInTasks = []
                for task in newTasks:
                    lfnsInTasks += task[1]
                tasks += newTasks

                # Remove the selected files from the size cache
                self.clearCachedFileSize(lfnsInTasks)
                if not groupSE:
                    # Remove files from other SEs
                    for se in [se for se in seFiles if se != replicaSE]:
                        seFiles[se] = [
                            lfn for lfn in seFiles[se]
                            if lfn not in lfnsInTasks
                        ]
                # Remove files from global list
                for lfn in lfnsInTasks:
                    files.pop(lfn)

            self.logVerbose("groupBySize: %d tasks created with groupSE %s" %
                            (len(tasks) - nTasks, str(groupSE)))
            self.logVerbose(
                "groupBySize: %d files have not been included in tasks" %
                len(files))
            nTasks = len(tasks)

        self.logVerbose("Grouped %d files by size" % len(files))
        return S_OK(tasks)

    def getExistingCounters(self, normalise=False, requestedSites=None):
        """Get the count of transformation files per UsedSE.

        :param bool normalise: if True, return percentages instead of counts
        :param requestedSites: optional list of sites; counts are then keyed
                               by site and restricted to those sites
        :returns: S_OK with dict of SE (or site) -> count (or share)
        """
        # Fix: default was a shared mutable list ([]); use None sentinel instead
        if requestedSites is None:
            requestedSites = []
        res = self.transClient.getCounters(
            'TransformationFiles', ['UsedSE'],
            {'TransformationID': self.params['TransformationID']})
        if not res['OK']:
            return res
        usageDict = {}
        for usedDict, count in res['Value']:
            usedSE = usedDict['UsedSE']
            if usedSE != 'Unknown':
                usageDict[usedSE] = count
        if requestedSites:
            siteDict = {}
            for se, count in usageDict.items():
                res = getSitesForSE(se)
                if not res['OK']:
                    return res
                for site in res['Value']:
                    if site in requestedSites:
                        siteDict[site] = count
            usageDict = siteDict.copy()
        if normalise:
            usageDict = self._normaliseShares(usageDict)
        return S_OK(usageDict)

    @timeThis
    def _getFileSize(self, lfns):
        """ Get file size from a cache, if not from the catalog
    #FIXME: have to fill the cachedLFNSize!
    """
        lfns = list(lfns)
        cachedLFNSize = dict(self.cachedLFNSize)

        fileSizes = {}
        for lfn in [lfn for lfn in lfns if lfn in cachedLFNSize]:
            fileSizes[lfn] = cachedLFNSize[lfn]
        self.logDebug("Found cache hit for File size for %d files out of %d" %
                      (len(fileSizes), len(lfns)))
        lfns = [lfn for lfn in lfns if lfn not in cachedLFNSize]
        if lfns:
            fileSizes = self._getFileSizeFromCatalog(lfns, fileSizes)
            if not fileSizes['OK']:
                self.logError(fileSizes['Message'])
                return fileSizes
            fileSizes = fileSizes['Value']
        return S_OK(fileSizes)

    @timeThis
    def _getFileSizeFromCatalog(self, lfns, fileSizes):
        """
    Get file size from the catalog and merge into the fileSizes dict

    :param list lfns: LFNs to look up
    :param dict fileSizes: already-known LFN -> size entries (not modified)
    :returns: S_OK with the merged LFN -> size dict
    """
        lfns = list(lfns)
        fileSizes = dict(fileSizes)

        res = self.fc.getFileSize(lfns)
        if not res['OK']:
            return S_ERROR("Failed to get sizes for all files: %s" %
                           res['Message'])
        if res['Value']['Failed']:
            errorReason = sorted(set(res['Value']['Failed'].values()))
            self.logWarn(
                "Failed to get sizes for %d files:" %
                len(res['Value']['Failed']), errorReason)
        fileSizes.update(res['Value']['Successful'])
        # Also feed the instance-level cache for subsequent calls
        self.cachedLFNSize.update((res['Value']['Successful']))
        self.logVerbose("Got size of %d files from catalog" % len(lfns))
        return S_OK(fileSizes)

    def clearCachedFileSize(self, lfns):
        """ Utility function: drop the given LFNs from the size cache
    """
        for lfn in [lfn for lfn in lfns if lfn in self.cachedLFNSize]:
            self.cachedLFNSize.pop(lfn)

    def getPluginParam(self, name, default=None):
        """ Get plugin parameters using specific settings or settings defined in the CS
        Caution: the type returned is that of the default value
    """
        # get the value of a parameter looking 1st in the CS
        if default is not None:
            valueType = type(default)
        else:
            valueType = None
        # First look at a generic value...
        optionPath = "TransformationPlugins/%s" % (name)
        value = Operations().getValue(optionPath, None)
        self.logVerbose("Default plugin param %s: '%s'" % (optionPath, value))
        # Then look at a plugin-specific value
        optionPath = "TransformationPlugins/%s/%s" % (self.plugin, name)
        value = Operations().getValue(optionPath, value)
        self.logVerbose("Specific plugin param %s: '%s'" % (optionPath, value))
        if value is not None:
            default = value
        # Finally look at a transformation-specific parameter
        value = self.params.get(name, default)
        self.logVerbose("Transformation plugin param %s: '%s'. Convert to %s" %
                        (name, value, str(valueType)))
        # Coerce the CS string value to the type of the default
        if valueType and type(value) is not valueType:
            if valueType is list:
                try:
                    value = ast.literal_eval(
                        value) if value and value != 'None' else []
                except Exception:
                    # Not a literal list: fall back to comma-separated parsing
                    value = [
                        val for val in value.replace(' ', '').split(',') if val
                    ]
            elif valueType is int:
                value = int(value)
            elif valueType is float:
                value = float(value)
            elif valueType is bool:
                if value in ('False', 'No', 'None', None, 0):
                    value = False
                else:
                    value = bool(value)
            elif valueType is not str:
                self.logWarn(
                    "Unknown parameter type (%s) for %s, passed as string" %
                    (str(valueType), name))
        self.logVerbose("Final plugin param %s: '%s'" % (name, value))
        return value

    @staticmethod
    def _normaliseShares(originalShares):
        """Return the shares normalised to percentages (sum == 100).

        :param dict originalShares: site -> share (numeric or numeric string)
        :returns: new dict site -> percentage; input is not modified
        """
        shares = {site: float(share)
                  for site, share in originalShares.items()}
        total = sum(shares.values())
        return {site: 100.0 * share / total
                for site, share in shares.items()}

    def uniqueSEs(self, ses):
        """Return the list of SEs with SE-equivalent duplicates removed."""
        newSEs = []
        for se in ses:
            if not self.isSameSEInList(se, newSEs):
                newSEs.append(se)
        return newSEs

    def isSameSE(self, se1, se2):
        """Return True if the two SEs point at the same physical storage.

        Storage parameters (Host, Path) are fetched once per SE and cached
        in self.seConfig; equal names short-circuit to True.
        """
        if se1 == se2:
            return True
        for se in (se1, se2):
            if se not in self.seConfig:
                self.seConfig[se] = {}
                res = StorageElement(se).getStorageParameters('SRM2')
                if res['OK']:
                    params = res['Value']
                    for item in ('Host', 'Path'):
                        # Treat t1d1 and t0d1 flavours of an endpoint as equivalent
                        self.seConfig[se][item] = params[item].replace(
                            't1d1', 't0d1')
                else:
                    self.logError(
                        "Error getting StorageElement parameters for %s" % se,
                        res['Message'])

        # Fix: if parameters could not be obtained for both SEs, both cached
        # configs are empty dicts and would wrongly compare equal; an empty
        # config must never match anything
        return bool(self.seConfig[se1]) and self.seConfig[se1] == self.seConfig[se2]

    def isSameSEInList(self, se1, seList):
        """Return True if se1 is, or is equivalent to, one of the SEs in seList."""
        if se1 in seList:
            return True
        for se in seList:
            if self.isSameSE(se1, se):
                return True
        return False

    def closerSEs(self, existingSEs, targetSEs, local=False):
        """ Order the targetSEs such that the first ones are closer to existingSEs. Keep all elements in targetSEs

        :param existingSEs: SEs already holding a replica
        :param targetSEs: candidate destination SEs
        :param bool local: if True, only return SEs at sites of existingSEs
        :returns: ordered list of SEs
    """
        setTarget = set(targetSEs)
        sameSEs = set([
            se1 for se1 in setTarget for se2 in existingSEs
            if self.isSameSE(se1, se2)
        ])
        targetSEs = setTarget - set(sameSEs)
        if targetSEs:
            # Some SEs are left, look for sites (ignoring archive SEs)
            existingSites = [
                self.dmsHelper.getLocalSiteForSE(se).get('Value')
                for se in existingSEs if not self.dmsHelper.isSEArchive(se)
            ]
            existingSites = set([site for site in existingSites if site])
            closeSEs = set([
                se for se in targetSEs if self.dmsHelper.getLocalSiteForSE(
                    se).get('Value') in existingSites
            ])
            otherSEs = targetSEs - closeSEs
            targetSEs = list(closeSEs)
            random.shuffle(targetSEs)
            if not local and otherSEs:
                otherSEs = list(otherSEs)
                random.shuffle(otherSEs)
                targetSEs += otherSEs
        else:
            targetSEs = []
        return (targetSEs + list(sameSEs)) if not local else targetSEs
def execute():
    """
  Parse the options and execute the script.

  Runs the BK query for reconstruction output files, fetches their replicas,
  and for each run sets the transformation destination to the site of the
  Tier1-RDST SE holding the most files of that run.
  """
    # NOTE(review): this function uses dict.iteritems() (Python 2 only) —
    # confirm it is never run under Python 3.
    bkQuery = dmScript.getBKQuery()
    fileType = bkQuery.getFileTypeList()
    # Only reconstruction outputs make sense for setting run destinations
    if not set(fileType) & {'FULL.DST', 'RDST', 'SDST'}:
        gLogger.error("Please provide a reconstruction BK path")
        DIRAC.exit(1)

    from LHCbDIRAC.TransformationSystem.Client.TransformationClient import TransformationClient
    from DIRAC.DataManagementSystem.Client.DataManager import DataManager
    from DIRAC.Core.Utilities.List import breakListIntoChunks
    from LHCbDIRAC.BookkeepingSystem.Client.BookkeepingClient import BookkeepingClient
    from DIRAC.DataManagementSystem.Utilities.DMSHelpers import DMSHelpers, resolveSEGroup

    bk = BookkeepingClient()
    tr = TransformationClient()
    dm = DataManager()
    dmsHelper = DMSHelpers()

    bkQueryDict = bkQuery.getQueryDict()
    gLogger.notice("For BK Query:", str(bkQueryDict))
    progressBar = ProgressBar(1, title="Running BK query...", step=1)
    res = bk.getFilesWithMetadata(bkQueryDict)
    if not res['OK']:
        gLogger.error("Error getting files from BK", res['Message'])
        DIRAC.exit(2)

    if 'ParameterNames' in res.get('Value', {}):
        parameterNames = res['Value']['ParameterNames']
        info = res['Value']['Records']
        progressBar.endLoop("Obtained %d files" % len(info))
    else:
        gLogger.error('\nNo metadata found')
        DIRAC.exit(3)
    # Group the LFNs by run number
    lfns = []
    runLFNs = {}
    for item in info:
        metadata = dict(zip(parameterNames, item))
        lfn = metadata['FileName']
        lfns.append(lfn)
        runLFNs.setdefault(metadata['RunNumber'], []).append(lfn)

    # Get replicas in chunks, collecting errors per failure reason
    chunkSize = 1000
    progressBar = ProgressBar(len(lfns),
                              title='Getting replicas of %d files' % len(lfns),
                              chunk=chunkSize)
    replicas = {}
    errors = {}
    for lfnChunk in breakListIntoChunks(lfns, chunkSize):
        progressBar.loop()
        res = dm.getReplicas(lfnChunk, getUrl=False)
        if not res['OK']:
            errors.setdefault(res['Message'], []).extend(lfnChunk)
        else:
            replicas.update(res['Value']['Successful'])
            for lfn, error in res['Value']['Failed'].iteritems():
                errors.setdefault(error, []).append(lfn)
    progressBar.endLoop()
    for error, lfns in errors.iteritems():
        gLogger.error(error, 'for %d files' % len(lfns))

    tier1RDST = set(resolveSEGroup('Tier1-RDST'))
    setOK = 0
    errors = {}
    progressBar = ProgressBar(len(runLFNs),
                              title='Defining destination for %d runs' %
                              len(runLFNs),
                              step=10)
    for run, lfns in runLFNs.iteritems():
        progressBar.loop()
        # Do not override an already-set destination
        res = tr.getDestinationForRun(run)
        if res.get('Value'):
            errors.setdefault('Destination already set', []).append(str(run))
            continue
        # Count, per Tier1-RDST SE, how many of the run's files it holds
        seCounts = {}
        for lfn in lfns:
            for se in tier1RDST.intersection(replicas.get(lfn, [])):
                seCounts[se] = seCounts.setdefault(se, 0) + 1
        # Pick the SE holding the most files of this run
        maxi = 0
        seMax = None
        for se, count in seCounts.iteritems():
            if count > maxi:
                seMax = se
                maxi = count
        if not seMax:
            # No Tier1-RDST replica found: fall back to CERN
            errors.setdefault('No SE found, use CERN-RDST',
                              []).append(str(run))
            seMax = 'CERN-RDST'
        # SE found, get its site
        res = dmsHelper.getLocalSiteForSE(seMax)
        if res['OK']:
            site = res['Value']
            res = tr.setDestinationForRun(run, site)
            if not res['OK']:
                errors.setdefault(res['Message'], []).append(str(run))
            else:
                setOK += 1
    progressBar.endLoop('Successfully set destination for %d runs' % setOK)
    for error, runs in errors.iteritems():
        gLogger.error(error, 'for runs %s' % ','.join(runs))
def getFilesToStage( lfnList, jobState = None, checkOnlyTapeSEs = None, jobLog = None ):
  """ Utility that returns out of a list of LFNs those files that are offline,
      and those for which at least one copy is online

      :param lfnList: LFN or list of LFNs to check
      :param jobState: optional job state, used to obtain the owner/group for the staging check
      :param checkOnlyTapeSEs: passed through to the Online/Offline check
      :param jobLog: optional logger passed through to the Online/Offline check

      :returns: S_OK with dict {'onlineLFNs', 'offlineLFNs' (chosen SE -> LFNs),
                'failedLFNs', 'absentLFNs', 'onlineSites'}
  """
  if not lfnList:
    return S_OK( {'onlineLFNs':[], 'offlineLFNs': {}, 'failedLFNs':[], 'absentLFNs':{}} )

  dm = DataManager()
  if isinstance( lfnList, basestring ):
    lfnList = [lfnList]

  lfnListReplicas = dm.getReplicasForJobs( lfnList, getUrl = False )
  if not lfnListReplicas['OK']:
    return lfnListReplicas

  offlineLFNsDict = {}
  onlineLFNs = {}
  offlineLFNs = {}
  absentLFNs = {}
  failedLFNs = set()
  # Fix: must be initialised here, not only inside the 'if seToLFNs' branch,
  # otherwise the final return raises UnboundLocalError when there are no
  # successful replicas
  onlineSites = set()
  if lfnListReplicas['Value']['Failed']:
    # Check if files are not existing
    for lfn, reason in lfnListReplicas['Value']['Failed'].iteritems():
      # FIXME: awful check until FC returns a proper error
      if cmpError( reason, errno.ENOENT ) or 'No such file' in reason:
        # The file doesn't exist, job must be Failed
        # FIXME: it is not possible to return here an S_ERROR(), return the message only
        absentLFNs[lfn] = S_ERROR( errno.ENOENT, 'File not in FC' )['Message']
    if absentLFNs:
      return S_OK({'onlineLFNs': list(onlineLFNs),
                   'offlineLFNs': offlineLFNsDict,
                   'failedLFNs': list(failedLFNs),
                   'absentLFNs': absentLFNs})
    return S_ERROR( "Failures in getting replicas" )

  lfnListReplicas = lfnListReplicas['Value']['Successful']
  # If a file is reported here at a tape SE, it is not at a disk SE as we use disk in priority
  # We shall check all file anyway in order to make sure they exist
  seToLFNs = dict()
  for lfn, ses in lfnListReplicas.iteritems():
    for se in ses:
      seToLFNs.setdefault( se, list() ).append( lfn )

  if seToLFNs:
    if jobState:
      # Get user name and group from the job state
      userName = jobState.getAttribute( 'Owner' )
      if not userName[ 'OK' ]:
        return userName
      userName = userName['Value']

      userGroup = jobState.getAttribute( 'OwnerGroup' )
      if not userGroup[ 'OK' ]:
        return userGroup
      userGroup = userGroup['Value']
    else:
      userName = None
      userGroup = None
    # Check whether files are Online or Offline, or missing at SE
    result = _checkFilesToStage( seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,  # pylint: disable=unexpected-keyword-arg
                                 checkOnlyTapeSEs = checkOnlyTapeSEs, jobLog = jobLog,
                                 proxyUserName = userName,
                                 proxyUserGroup = userGroup,
                                 executionLock = True )

    if not result['OK']:
      return result
    failedLFNs = set( lfnList ) - set( onlineLFNs ) - set( offlineLFNs ) - set( absentLFNs )

    # Get the sites with at least one online replica
    dmsHelper = DMSHelpers()
    onlineSEs = set( se for ses in onlineLFNs.values() for se in ses )
    onlineSites = set( dmsHelper.getLocalSiteForSE( se ).get( 'Value' ) for se in onlineSEs ) - {None}
    for lfn in offlineLFNs:
      ses = offlineLFNs[lfn]
      if len( ses ) == 1:
        # No choice, let's go
        offlineLFNsDict.setdefault( ses[0], list() ).append( lfn )
        continue
      # Try and get an SE at a site already with online files
      found = False
      if onlineSites:
        # If there is at least one online site, select one
        for se in ses:
          site = dmsHelper.getLocalSiteForSE( se )
          if site['OK']:
            if site['Value'] in onlineSites:
              offlineLFNsDict.setdefault( se, list() ).append( lfn )
              found = True
              break
      # No online site found in common, select randomly
      if not found:
        offlineLFNsDict.setdefault( random.choice( ses ), list() ).append( lfn )

  return S_OK({'onlineLFNs': list(onlineLFNs),
               'offlineLFNs': offlineLFNsDict,
               'failedLFNs': list(failedLFNs),
               'absentLFNs': absentLFNs,
               'onlineSites': onlineSites})