Example 1
def filterReplicas(opFile, logger=None, dataManager=None):
    """ filter out banned/invalid source SEs """

    if logger is None:
        logger = gLogger
    if dataManager is None:
        dataManager = DataManager()

    log = logger.getSubLogger("filterReplicas")
    result = defaultdict(list)

    replicas = dataManager.getActiveReplicas(opFile.LFN, getUrl=False)
    if not replicas["OK"]:
        log.error('Failed to get active replicas', replicas["Message"])
        return replicas
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)

    replicas = replicas["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        allReplicas = dataManager.getReplicas(opFile.LFN, getUrl=False)
        if allReplicas['OK']:
            allReplicas = allReplicas['Value']['Successful'].get(
                opFile.LFN, {})
            if not allReplicas:
                result['NoReplicas'].append(None)
                noReplicas = True
            else:
                # There are replicas, but we cannot get their metadata because they are not active
                result['NoActiveReplicas'] += list(allReplicas)
            log.verbose(
                "File has no%s replica in File Catalog" %
                ('' if noReplicas else ' active'), opFile.LFN)
        else:
            return allReplicas

    if not opFile.Checksum or hexAdlerToInt(opFile.Checksum) is False:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get('Value',
                                    {}).get('Successful',
                                            {}).get(opFile.LFN,
                                                    {}).get('Checksum')
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            if hexAdlerToInt(fcChecksum) is not False:
                opFile.Checksum = fcChecksum
                opFile.ChecksumType = fcMetadata['Value']['Successful'][
                    opFile.LFN].get('ChecksumType', 'Adler32')
            else:
                opFile.Checksum = None

    # If no replica was found, return what we collected as information
    if not replicas:
        return S_OK(result)

    for repSEName in replicas:
        repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
        error = repSEMetadata.get(
            'Message',
            repSEMetadata.get('Value', {}).get('Failed', {}).get(opFile.LFN))
        if error:
            log.warn(
                'unable to get metadata at %s for %s' %
                (repSEName, opFile.LFN), error.replace('\n', ''))
            if 'File does not exist' in error:
                result['NoReplicas'].append(repSEName)
            else:
                result["NoMetadata"].append(repSEName)
        elif not noReplicas:
            repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]

            seChecksum = hexAdlerToInt(repSEMetadata.get("Checksum"))
            # From here on, seChecksum is an integer or False, not a hex string!
            if seChecksum is False and opFile.Checksum:
                result['NoMetadata'].append(repSEName)
            elif not seChecksum and opFile.Checksum:
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and (not opFile.Checksum
                                 or opFile.Checksum == 'False'):
                # Use the SE checksum (convert to hex) and force type to be Adler32
                opFile.Checksum = intAdlerToHex(seChecksum)
                opFile.ChecksumType = 'Adler32'
            if not opFile.Checksum or not seChecksum or compareAdler(
                    intAdlerToHex(seChecksum), opFile.Checksum):
                # # All checksums are OK
                result["Valid"].append(repSEName)
            else:
                log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                         (opFile.LFN, opFile.Checksum, repSEName,
                          intAdlerToHex(seChecksum)))
                result["Bad"].append(repSEName)
        else:
            # If a replica was found somewhere, don't set the file as no replicas
            result['NoReplicas'] = []

    return S_OK(result)
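
The Adler32 helpers used in Example 1 (and in Examples 4 and 8 below), hexAdlerToInt, intAdlerToHex and compareAdler, are imported from elsewhere; in DIRAC they live in DIRAC.Core.Utilities.Adler. Here is a minimal sketch of their semantics as inferred from the call sites; the actual implementations may differ:

def hexAdlerToInt(hexAdler):
    """Parse a hex Adler32 string: empty input stays falsy, garbage maps to False."""
    if not hexAdler:
        return None
    try:
        return int(str(hexAdler), 16)
    except ValueError:
        return False

def intAdlerToHex(intAdler):
    """Render an integer Adler32 as an 8-character lowercase hex string."""
    return "%08x" % intAdler

def compareAdler(adler1, adler2):
    """Compare two Adler32 values, tolerating case and leading-zero differences."""
    try:
        return int(str(adler1), 16) == int(str(adler2), 16)
    except (ValueError, TypeError):
        return False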
Example 2
def filterReplicas( opFile, logger = None, dataManager = None ):
  """ filter out banned/invalid source SEs """

  if logger is None:
    logger = gLogger
  if dataManager is None:
    dataManager = DataManager()

  log = logger.getSubLogger( "filterReplicas" )
  ret = { "Valid" : [], "NoMetadata" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }

  replicas = dataManager.getActiveReplicas( opFile.LFN )
  if not replicas["OK"]:
    log.error( 'Failed to get active replicas', replicas["Message"] )
    return replicas
  reNotExists = re.compile( r".*such file.*" )
  replicas = replicas["Value"]
  failed = replicas["Failed"].get( opFile.LFN , "" )
  if reNotExists.match( failed.lower() ):
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR( failed )

  replicas = replicas["Successful"].get( opFile.LFN, {} )
  noReplicas = False
  if not replicas:
    allReplicas = dataManager.getReplicas( opFile.LFN )
    if allReplicas['OK']:
      allReplicas = allReplicas['Value']['Successful'].get( opFile.LFN, {} )
      if not allReplicas:
        ret['NoReplicas'].append( None )
        noReplicas = True
      else:
        # Fall back to inactive replicas, to check whether the file exists at all
        replicas = allReplicas
      log.warn( "File has no%s replica in File Catalog" % ( '' if noReplicas else ' active' ), opFile.LFN )
    else:
      return allReplicas

  if not opFile.Checksum:
    # Set Checksum to FC checksum if not set in the request
    fcMetadata = FileCatalog().getFileMetadata( opFile.LFN )
    fcChecksum = fcMetadata.get( 'Value', {} ).get( 'Successful', {} ).get( opFile.LFN, {} ).get( 'Checksum' )
    # Replace opFile.Checksum if it doesn't match a valid FC checksum
    if fcChecksum:
      opFile.Checksum = fcChecksum
      opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get( 'ChecksumType', 'Adler32' )

  for repSEName in replicas:
    repSEMetadata = StorageElement( repSEName ).getFileMetadata( opFile.LFN )
    error = repSEMetadata.get( 'Message', repSEMetadata.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) )
    if error:
      log.warn( 'unable to get metadata at %s for %s' % ( repSEName, opFile.LFN ), error.replace( '\n', '' ) )
      if 'File does not exist' in error:
        ret['NoReplicas'].append( repSEName )
      else:
        ret["NoMetadata"].append( repSEName )
    elif not noReplicas:
      repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]

      seChecksum = repSEMetadata.get( "Checksum" )
      if not seChecksum and opFile.Checksum:
        opFile.Checksum = None
        opFile.ChecksumType = None
      elif seChecksum and not opFile.Checksum:
        opFile.Checksum = seChecksum
      if not opFile.Checksum or not seChecksum or compareAdler( seChecksum, opFile.Checksum ):
        # # All checksums are OK
        ret["Valid"].append( repSEName )
      else:
        log.warn( " %s checksum mismatch, FC: '%s' @%s: '%s'" % ( opFile.LFN,
                                                              opFile.Checksum,
                                                              repSEName,
                                                              seChecksum ) )
        ret["Bad"].append( repSEName )
    else:
      # If a replica was found somewhere, don't set the file as no replicas
      ret['NoReplicas'] = []

  return S_OK( ret )
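
A hypothetical caller-side sketch of how the category lists returned by filterReplicas are typically consumed; the surrounding names (opFile, sourceSE) are assumptions, and the dict shapes follow the S_OK/S_ERROR convention used above:

res = filterReplicas(opFile)
if not res["OK"]:
    opFile.Error = res["Message"]              # hard failure, e.g. the file is unknown
else:
    replicaStatus = res["Value"]
    if replicaStatus["Valid"]:
        sourceSE = replicaStatus["Valid"][0]   # any SE holding a checksum-verified copy
    elif replicaStatus["Bad"]:
        opFile.Error = "all replicas have mismatched checksums"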
Example 3
def filterReplicas( opFile, logger = None, dataManager = None, seCache = None ):
  """ filter out banned/invalid source SEs """

  if not logger:
    logger = gLogger
  if not dataManager:
    dataManager = DataManager()
  if not seCache:
    seCache = {}

  log = logger.getSubLogger( "filterReplicas" )
  ret = { "Valid" : [], "NoMetadata" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }

  replicas = dataManager.getActiveReplicas( opFile.LFN )
  if not replicas["OK"]:
    log.error( replicas["Message"] )
    return replicas
  reNotExists = re.compile( "not such file or directory" )
  replicas = replicas["Value"]
  failed = replicas["Failed"].get( opFile.LFN , "" )
  if reNotExists.match( failed.lower() ):
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR( failed )

  replicas = replicas["Successful"].get( opFile.LFN, {} )

  for repSEName in replicas:

    repSE = seCache[repSEName] if repSEName in seCache else \
            seCache.setdefault( repSEName, StorageElement( repSEName ) )

    pfn = repSE.getPfnForLfn( opFile.LFN )
    if not pfn["OK"] or opFile.LFN not in pfn['Value']['Successful']:
      log.warn( "unable to create pfn for %s lfn at %s: %s" % ( opFile.LFN,
                                                                repSEName,
                                                                pfn.get( 'Message', pfn.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) ) ) )
      ret["NoPFN"].append( repSEName )
    else:
      pfn = pfn["Value"]['Successful'][ opFile.LFN ]

      repSEMetadata = repSE.getFileMetadata( pfn )
      error = repSEMetadata.get( 'Message', repSEMetadata.get( 'Value', {} ).get( 'Failed', {} ).get( pfn ) )
      if error:
        log.warn( 'unable to get metadata at %s for %s' % ( repSEName, opFile.LFN ), error.replace( '\n', '' ) )
        if 'File does not exist' in error:
          ret['NoReplicas'].append( repSEName )
        else:
          ret["NoMetadata"].append( repSEName )
      else:
        repSEMetadata = repSEMetadata['Value']['Successful'][pfn]

        seChecksum = repSEMetadata.get( "Checksum" )
        if opFile.Checksum and seChecksum and not compareAdler( seChecksum, opFile.Checksum ) :
          # The checksum in the request may be wrong, check with FC
          fcMetadata = FileCatalog().getFileMetadata( opFile.LFN )
          fcChecksum = fcMetadata.get( 'Value', {} ).get( 'Successful', {} ).get( opFile.LFN, {} ).get( 'Checksum' )
          if fcChecksum and fcChecksum != opFile.Checksum and compareAdler( fcChecksum , seChecksum ):
            opFile.Checksum = fcChecksum
            ret['Valid'].append( repSEName )
          else:
            log.warn( " %s checksum mismatch, request: %s @%s: %s" % ( opFile.LFN,
                                                                       opFile.Checksum,
                                                                       repSEName,
                                                                       seChecksum ) )
            ret["Bad"].append( repSEName )
        else:
          # # if we're here repSE is OK
          ret["Valid"].append( repSEName )

  return S_OK( ret )
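
The seCache conditional in Example 3 only constructs a StorageElement on a cache miss; a plain dict.setdefault would build the object even when the key already exists. When construction is side-effect free, functools offers the same memoisation more compactly. A sketch of that alternative, not what the original does:

from functools import lru_cache

@lru_cache(maxsize=None)
def cachedSE(seName):
    # Constructed once per distinct SE name, reused on every later call
    return StorageElement(seName)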
Example 4
def filterReplicas(opFile, logger=None, dataManager=None):
  """ filter out banned/invalid source SEs """

  if logger is None:
    logger = gLogger
  if dataManager is None:
    dataManager = DataManager()

  log = logger.getSubLogger("filterReplicas")
  result = defaultdict(list)

  replicas = dataManager.getActiveReplicas(opFile.LFN, getUrl=False)
  if not replicas["OK"]:
    log.error('Failed to get active replicas', replicas["Message"])
    return replicas
  reNotExists = re.compile(r".*such file.*")
  replicas = replicas["Value"]
  failed = replicas["Failed"].get(opFile.LFN, "")
  if reNotExists.match(failed.lower()):
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR(failed)

  replicas = replicas["Successful"].get(opFile.LFN, {})
  noReplicas = False
  if not replicas:
    allReplicas = dataManager.getReplicas(opFile.LFN, getUrl=False)
    if allReplicas['OK']:
      allReplicas = allReplicas['Value']['Successful'].get(opFile.LFN, {})
      if not allReplicas:
        result['NoReplicas'].append(None)
        noReplicas = True
      else:
        # There are replicas, but we cannot get their metadata because they are not active
        result['NoActiveReplicas'] += list(allReplicas)
      log.verbose("File has no%s replica in File Catalog" % ('' if noReplicas else ' active'), opFile.LFN)
    else:
      return allReplicas

  if not opFile.Checksum or hexAdlerToInt(opFile.Checksum) is False:
    # Set Checksum to FC checksum if not set in the request
    fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
    fcChecksum = fcMetadata.get('Value', {}).get('Successful', {}).get(opFile.LFN, {}).get('Checksum')
    # Replace opFile.Checksum if it doesn't match a valid FC checksum
    if fcChecksum:
      if hexAdlerToInt(fcChecksum) is not False:
        opFile.Checksum = fcChecksum
        opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get('ChecksumType', 'Adler32')
      else:
        opFile.Checksum = None

  # If no replica was found, return what we collected as information
  if not replicas:
    return S_OK(result)

  for repSEName in replicas:
    repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
    error = repSEMetadata.get('Message', repSEMetadata.get('Value', {}).get('Failed', {}).get(opFile.LFN))
    if error:
      log.warn('unable to get metadata at %s for %s' % (repSEName, opFile.LFN), error.replace('\n', ''))
      if 'File does not exist' in error:
        result['NoReplicas'].append(repSEName)
      else:
        result["NoMetadata"].append(repSEName)
    elif not noReplicas:
      repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]

      seChecksum = hexAdlerToInt(repSEMetadata.get("Checksum"))
      # From here on, seChecksum is an integer or False, not a hex string!
      if seChecksum is False and opFile.Checksum:
        result['NoMetadata'].append(repSEName)
      elif not seChecksum and opFile.Checksum:
        opFile.Checksum = None
        opFile.ChecksumType = None
      elif seChecksum and (not opFile.Checksum or opFile.Checksum == 'False'):
        # Use the SE checksum (convert to hex) and force type to be Adler32
        opFile.Checksum = intAdlerToHex(seChecksum)
        opFile.ChecksumType = 'Adler32'
      if not opFile.Checksum or not seChecksum or compareAdler(
              intAdlerToHex(seChecksum), opFile.Checksum):
        # # All checksums are OK
        result["Valid"].append(repSEName)
      else:
        log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                 (opFile.LFN, opFile.Checksum, repSEName, intAdlerToHex(seChecksum)))
        result["Bad"].append(repSEName)
    else:
      # If a replica was found somewhere, don't set the file as no replicas
      result['NoReplicas'] = []

  return S_OK(result)
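
Examples 1 and 4 build their result with collections.defaultdict(list) instead of the fixed-key ret dict of the other versions, so a new category such as 'NoActiveReplicas' appears on first use. A minimal illustration (the SE names are made up):

from collections import defaultdict

result = defaultdict(list)
result['NoActiveReplicas'] += ['CERN-DST-EOS', 'GRIDKA-DST']  # key created lazily
result['NoReplicas'].append(None)
assert dict(result) == {'NoActiveReplicas': ['CERN-DST-EOS', 'GRIDKA-DST'],
                        'NoReplicas': [None]}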
Example 5
def filterReplicas(opFile, logger=None, dataManager=None, seCache=None):
    """ filter out banned/invalid source SEs """

    if not logger:
        logger = gLogger
    if not dataManager:
        dataManager = DataManager()
    if not seCache:
        seCache = {}

    log = logger.getSubLogger("filterReplicas")
    ret = {
        "Valid": [],
        "NoMetadata": [],
        "Bad": [],
        'NoReplicas': [],
        'NoPFN': []
    }

    replicas = dataManager.getActiveReplicas(opFile.LFN)
    if not replicas["OK"]:
        log.error(replicas["Message"])
        return replicas
    reNotExists = re.compile("not such file or directory")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)

    replicas = replicas["Successful"].get(opFile.LFN, {})

    for repSEName in replicas:

        repSE = seCache[repSEName] if repSEName in seCache else \
            seCache.setdefault(repSEName, StorageElement(repSEName))

        pfn = repSE.getPfnForLfn(opFile.LFN)
        if not pfn["OK"] or opFile.LFN not in pfn['Value']['Successful']:
            log.warn(
                "unable to create pfn for %s lfn at %s: %s" %
                (opFile.LFN, repSEName,
                 pfn.get(
                     'Message',
                     pfn.get('Value', {}).get('Failed', {}).get(opFile.LFN))))
            ret["NoPFN"].append(repSEName)
        else:
            pfn = pfn["Value"]['Successful'][opFile.LFN]

            repSEMetadata = repSE.getFileMetadata(pfn)
            error = repSEMetadata.get(
                'Message',
                repSEMetadata.get('Value', {}).get('Failed', {}).get(pfn))
            if error:
                log.warn(
                    'unable to get metadata at %s for %s' %
                    (repSEName, opFile.LFN), error.replace('\n', ''))
                if 'File does not exist' in error:
                    ret['NoReplicas'].append(repSEName)
                else:
                    ret["NoMetadata"].append(repSEName)
            else:
                repSEMetadata = repSEMetadata['Value']['Successful'][pfn]

                seChecksum = repSEMetadata.get("Checksum")
                if opFile.Checksum and seChecksum and not compareAdler(
                        seChecksum, opFile.Checksum):
                    # The checksum in the request may be wrong, check with FC
                    fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
                    fcChecksum = fcMetadata.get('Value', {}).get(
                        'Successful', {}).get(opFile.LFN, {}).get('Checksum')
                    if fcChecksum and fcChecksum != opFile.Checksum and compareAdler(
                            fcChecksum, seChecksum):
                        opFile.Checksum = fcChecksum
                        ret['Valid'].append(repSEName)
                    else:
                        log.warn(" %s checksum mismatch, request: %s @%s: %s" %
                                 (opFile.LFN, opFile.Checksum, repSEName,
                                  seChecksum))
                        ret["Bad"].append(repSEName)
                else:
                    # # if we're here repSE is OK
                    ret["Valid"].append(repSEName)

    return S_OK(ret)
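
Every version extracts a per-LFN error with the same nested chain, res.get('Message', res.get('Value', {}).get('Failed', {}).get(lfn)). A small helper naming that idiom, sketched under the usual DIRAC return-value convention (the helper itself is hypothetical):

def getLfnError(res, lfn):
    """Error for lfn from a DIRAC result dict, or None on success."""
    if "Message" in res:
        # The call as a whole failed
        return res["Message"]
    # Otherwise look for a per-LFN entry in the 'Failed' sub-dictionary
    return res.get("Value", {}).get("Failed", {}).get(lfn)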
Example 6
def filterReplicas( opFile, logger = None, dataManager = None ):
  """ filter out banned/invalid source SEs """

  if logger is None:
    logger = gLogger
  if dataManager is None:
    dataManager = DataManager()

  log = logger.getSubLogger( "filterReplicas" )
  ret = { "Valid" : [], "NoMetadata" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }

  replicas = dataManager.getActiveReplicas( opFile.LFN )
  if not replicas["OK"]:
    log.error( 'Failed to get active replicas', replicas["Message"] )
    return replicas
  reNotExists = re.compile( r".*such file.*" )
  replicas = replicas["Value"]
  failed = replicas["Failed"].get( opFile.LFN , "" )
  if reNotExists.match( failed.lower() ):
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR( failed )

  replicas = replicas["Successful"].get( opFile.LFN, {} )

  if not opFile.Checksum:
    # Set Checksum to FC checksum if not set in the request
    fcMetadata = FileCatalog().getFileMetadata( opFile.LFN )
    fcChecksum = fcMetadata.get( 'Value', {} ).get( 'Successful', {} ).get( opFile.LFN, {} ).get( 'Checksum', '' )
    # Replace opFile.Checksum if it doesn't match a valid FC checksum
    if fcChecksum:
      opFile.Checksum = fcChecksum
      opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get( 'ChecksumType', 'Adler32' )

  for repSEName in replicas:

    repSE = StorageElement( repSEName )

    repSEMetadata = repSE.getFileMetadata( opFile.LFN )
    error = repSEMetadata.get( 'Message', repSEMetadata.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) )
    if error:
      log.warn( 'unable to get metadata at %s for %s' % ( repSEName, opFile.LFN ), error.replace( '\n', '' ) )
      if 'File does not exist' in error:
        ret['NoReplicas'].append( repSEName )
      else:
        ret["NoMetadata"].append( repSEName )
    else:
      repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]

      seChecksum = repSEMetadata.get( "Checksum" )
      if ( opFile.Checksum and seChecksum and compareAdler( seChecksum, opFile.Checksum ) ) or\
         ( not opFile.Checksum and not seChecksum ):
        # # All checksums are OK
        ret["Valid"].append( repSEName )
      else:
        log.warn( " %s checksum mismatch, FC: '%s' @%s: '%s'" % ( opFile.LFN,
                                                              opFile.Checksum,
                                                              repSEName,
                                                              seChecksum ) )
        ret["Bad"].append( repSEName )

  return S_OK( ret )
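
Example 6's validity test is stricter than the one in Examples 2, 7 and 8: it accepts a replica only when both checksums are known and agree, or when both are unknown, while the later forms also accept a replica when exactly one side is unknown (after the preceding code has tried to normalise opFile.Checksum). The two predicates restated side by side, as a sketch that assumes compareAdler is available:

def validStrict(requestSum, seSum):
    # Example 6: both known and matching, or both absent
    return bool(requestSum and seSum and compareAdler(seSum, requestSum)) \
           or (not requestSum and not seSum)

def validLenient(requestSum, seSum):
    # Examples 2, 7 and 8: a missing side is forgiven
    return not requestSum or not seSum or compareAdler(seSum, requestSum)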
Example 7
def filterReplicas(opFile, logger=None, dataManager=None):
    """ filter out banned/invalid source SEs """

    if logger is None:
        logger = gLogger
    if dataManager is None:
        dataManager = DataManager()

    log = logger.getSubLogger("filterReplicas")
    ret = {"Valid": [], "NoMetadata": [], "Bad": [], "NoReplicas": [], "NoPFN": []}

    replicas = dataManager.getActiveReplicas(opFile.LFN)
    if not replicas["OK"]:
        log.error("Failed to get active replicas", replicas["Message"])
        return replicas
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)

    replicas = replicas["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        allReplicas = dataManager.getReplicas(opFile.LFN)
        if allReplicas["OK"]:
            allReplicas = allReplicas["Value"]["Successful"].get(opFile.LFN, {})
            if not allReplicas:
                ret["NoReplicas"].append(None)
                noReplicas = True
            else:
                # Fall back to inactive replicas, to check whether the file exists at all
                replicas = allReplicas
            log.warn("File has no%s replica in File Catalog" % ("" if noReplicas else " active"), opFile.LFN)
        else:
            return allReplicas

    if not opFile.Checksum:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get("Value", {}).get("Successful", {}).get(opFile.LFN, {}).get("Checksum")
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            opFile.Checksum = fcChecksum
            opFile.ChecksumType = fcMetadata["Value"]["Successful"][opFile.LFN].get("ChecksumType", "Adler32")

    for repSEName in replicas:
        repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
        error = repSEMetadata.get("Message", repSEMetadata.get("Value", {}).get("Failed", {}).get(opFile.LFN))
        if error:
            log.warn("unable to get metadata at %s for %s" % (repSEName, opFile.LFN), error.replace("\n", ""))
            if "File does not exist" in error:
                ret["NoReplicas"].append(repSEName)
            else:
                ret["NoMetadata"].append(repSEName)
        elif not noReplicas:
            repSEMetadata = repSEMetadata["Value"]["Successful"][opFile.LFN]

            seChecksum = repSEMetadata.get("Checksum")
            if not seChecksum and opFile.Checksum:
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and not opFile.Checksum:
                opFile.Checksum = seChecksum
            if not opFile.Checksum or not seChecksum or compareAdler(seChecksum, opFile.Checksum):
                # # All checksums are OK
                ret["Valid"].append(repSEName)
            else:
                log.warn(
                    " %s checksum mismatch, FC: '%s' @%s: '%s'" % (opFile.LFN, opFile.Checksum, repSEName, seChecksum)
                )
                ret["Bad"].append(repSEName)
        else:
            # If a replica was found somewhere, don't set the file as no replicas
            ret["NoReplicas"] = []

    return S_OK(ret)
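
Because logger and dataManager are injectable, filterReplicas can be exercised without grid access. A hypothetical unittest.mock sketch for the 'no such file' branch; it assumes the DIRAC names the function uses (gLogger, S_ERROR, etc.) are importable:

from unittest import mock

dm = mock.Mock()
dm.getActiveReplicas.return_value = {
    "OK": True,
    "Value": {"Successful": {},
              "Failed": {"/lhcb/test/f.dst": "No such file or directory"}},
}
opFile = mock.Mock(LFN="/lhcb/test/f.dst", Checksum=None)

res = filterReplicas(opFile, dataManager=dm)
assert not res["OK"]                  # S_ERROR carries the FC error message
assert opFile.Status == "Failed"      # the file was marked as failed in place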
Example 8
def filterReplicas( opFile, logger = None, dataManager = None ):
  """ filter out banned/invalid source SEs """

  if logger is None:
    logger = gLogger
  if dataManager is None:
    dataManager = DataManager()

  log = logger.getSubLogger( "filterReplicas" )
  ret = { "Valid" : [], "NoMetadata" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }

  replicas = dataManager.getActiveReplicas( opFile.LFN )
  if not replicas["OK"]:
    log.error( 'Failed to get active replicas', replicas["Message"] )
    return replicas
  reNotExists = re.compile( r".*such file.*" )
  replicas = replicas["Value"]
  failed = replicas["Failed"].get( opFile.LFN , "" )
  if reNotExists.match( failed.lower() ):
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR( failed )

  replicas = replicas["Successful"].get( opFile.LFN, {} )
  noReplicas = False
  if not replicas:
    allReplicas = dataManager.getReplicas( opFile.LFN )
    if allReplicas['OK']:
      allReplicas = allReplicas['Value']['Successful'].get( opFile.LFN, {} )
      if not allReplicas:
        ret['NoReplicas'].append( None )
        noReplicas = True
      else:
        # Fall back to inactive replicas, to check whether the file exists at all
        replicas = allReplicas
      log.warn( "File has no%s replica in File Catalog" % ( '' if noReplicas else ' active' ), opFile.LFN )
    else:
      return allReplicas

  if not opFile.Checksum or hexAdlerToInt( opFile.Checksum ) is False:
    # Set Checksum to FC checksum if not set in the request
    fcMetadata = FileCatalog().getFileMetadata( opFile.LFN )
    fcChecksum = fcMetadata.get( 'Value', {} ).get( 'Successful', {} ).get( opFile.LFN, {} ).get( 'Checksum' )
    # Replace opFile.Checksum if it doesn't match a valid FC checksum
    if fcChecksum:
      if hexAdlerToInt( fcChecksum ) is not False:
        opFile.Checksum = fcChecksum
        opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get( 'ChecksumType', 'Adler32' )
      else:
        opFile.Checksum = None

  for repSEName in replicas:
    repSEMetadata = StorageElement( repSEName ).getFileMetadata( opFile.LFN )
    error = repSEMetadata.get( 'Message', repSEMetadata.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) )
    if error:
      log.warn( 'unable to get metadata at %s for %s' % ( repSEName, opFile.LFN ), error.replace( '\n', '' ) )
      if 'File does not exist' in error:
        ret['NoReplicas'].append( repSEName )
      else:
        ret["NoMetadata"].append( repSEName )
    elif not noReplicas:
      repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]

      seChecksum = hexAdlerToInt( repSEMetadata.get( "Checksum" ) )
      # From here on, seChecksum is an integer or False, not a hex string!
      if seChecksum is False and opFile.Checksum:
        ret['NoMetadata'].append( repSEName )
      elif not seChecksum and opFile.Checksum:
        opFile.Checksum = None
        opFile.ChecksumType = None
      elif seChecksum and ( not opFile.Checksum or opFile.Checksum == 'False' ):
        # Use the SE checksum (converted back to hex) and force type to be Adler32
        opFile.Checksum = intAdlerToHex( seChecksum )
        opFile.ChecksumType = 'Adler32'
      if not opFile.Checksum or not seChecksum or compareAdler( intAdlerToHex( seChecksum ), opFile.Checksum ):
        # # All checksums are OK
        ret["Valid"].append( repSEName )
      else:
        log.warn( " %s checksum mismatch, FC: '%s' @%s: '%s'" % ( opFile.LFN,
                                                              opFile.Checksum,
                                                              repSEName,
                                                              intAdlerToHex( seChecksum ) ) )
        ret["Bad"].append( repSEName )
    else:
      # If a replica was found somewhere, don't set the file as no replicas
      ret['NoReplicas'] = []

  return S_OK( ret )
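
hexAdlerToInt signals an unparseable checksum with False, which is why Examples 1, 4 and 8 test its result with is False rather than truthiness or equality: a perfectly parseable checksum can itself compare equal to False. A two-line illustration:

seChecksum = int("00000000", 16)   # a parseable, all-zero checksum value
assert seChecksum == False         # equality conflates it with the error marker
assert seChecksum is not False     # identity keeps the two cases apart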
Example 9
def scanPopularity(since, getAllDatasets, topDirectory='/lhcb', csvFile=None):
    """
  That function does the job to cache the directories, get the corresponding datasets and join with the popularity
  """
    # Reset global variables

    bkPathForDir.clear()
    cachedInvisible.clear()
    prodForBKPath.clear()
    bkPathUsage.clear()
    processingPass.clear()
    bkPathPopularity.clear()
    physicalStorageUsage.clear()
    datasetStorage.clear()
    for infoType in storageTypes:
        datasetStorage[infoType] = set()

    # set of used directories
    usedDirectories = set()
    usedSEs = {}
    binSize = 'week'
    nbBins = int((since + 6) / 7)
    since = 7 * nbBins

    from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
    ignoreDirectories = Operations().getValue(
        'DataManagement/PopularityIgnoreDirectories', [
            'user', 'test', 'debug', 'dataquality', 'software', 'database',
            'swtest', 'certification', 'validation'
        ])
    nowBin = getTimeBin(datetime.now() - timedelta(days=1))
    notCached = set()

    if getAllDatasets:
        # Get list of directories
        startTime = time.time()
        res = FileCatalog().listDirectory(topDirectory)
        if not res['OK']:
            gLogger.fatal("Cannot get list of directories", res['Message'])
            DIRAC.exit(1)
        directories = set(
            subDir
            for subDir in res['Value']['Successful'][topDirectory]['SubDirs']
            if subDir.split('/')[2] not in ignoreDirectories and 'RAW' not in
            subDir and 'RDST' not in subDir and 'SDST' not in subDir)
        allDirectoriesSet = set()
        for baseDir in directories:
            allDirectoriesSet.update(getPhysicalUsage(baseDir))
        gLogger.always(
            "Obtained storage usage information for %d directories in %.1f seconds"
            % (len(allDirectoriesSet), time.time() - startTime))
        cacheDirectories(allDirectoriesSet)

    # Get the popularity raw information for the specified number of days
    if since:
        entries = 0
        now = datetime.now()
        endTime = datetime(now.year, now.month, now.day, 0, 0, 0)
        startTime = endTime
        gLogger.always('Get popularity day-by-day')
        stTime = time.time()
        for _i in range(since):
            endTime = startTime
            startTime = endTime - timedelta(days=1)
            endTimeQuery = endTime.isoformat()
            startTimeQuery = startTime.isoformat()
            status = 'Used'
            # Getting the popularity with 10 retries for the given day
            for _i in range(10):
                res = duClient.getDataUsageSummary(startTimeQuery,
                                                   endTimeQuery,
                                                   status,
                                                   timeout=7200)
                if res['OK']:
                    break
                gLogger.error("Error getting popularity entries, retrying...",
                              res['Message'])
            if not res['OK']:
                gLogger.fatal("Error getting popularity entries",
                              res['Message'])
                DIRAC.exit(1)
            val = res['Value']
            entries += len(val)

            # Get information on useful directories
            directories = set(row[1] for row in val
                              if row[1].split('/')[2] not in ignoreDirectories)
            usedDirectories.update(directories)
            cacheDirectories(directories)

            # Get information in bins (day or week)
            for _rowId, dirLfn, se, count, insertTime in val:
                if dirLfn not in directories:
                    # print rowId, dirLfn, count, insertTime, 'ignored'
                    continue
                # get the binNumber (day or week)
                binNumber = getTimeBin(insertTime)
                bkPath = bkPathForDir.get(dirLfn)
                if not bkPath:
                    if dirLfn not in notCached:
                        notCached.add(dirLfn)
                        gLogger.error('Directory %s was not cached' % dirLfn)
                    bkPath = 'Unknown-' + dirLfn
                dirPop = bkPathPopularity.setdefault(bkPath, {})
                dirPop[binNumber] = dirPop.get(binNumber, 0) + count
                usedSEs.setdefault(bkPath, set()).add(se)

        gLogger.always(
            "\n=============================================================")
        gLogger.always(
            "Retrieved %d entries from Popularity table in %.1f seconds" %
            (entries, time.time() - stTime))
        gLogger.always('Found %d datasets used since %d days' %
                       (len(bkPathPopularity), since))
        counters = {}
        strangeBKPaths = set(
            bkPath for bkPath in bkPathPopularity
            if not bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))[0])
        if strangeBKPaths:
            gLogger.always('%d used datasets do not have an LFN count:' %
                           len(strangeBKPaths))
            gLogger.always('\n'.join("%s : %s" %
                                     (bkPath, str(bkPathUsage.get(bkPath, {})))
                                     for bkPath in strangeBKPaths))
        gLogger.always('\nDataset usage for %d datasets' %
                       len(bkPathPopularity))
        for infoType in ('All', 'LFN'):
            for i in range(2):
                counters.setdefault(infoType, []).append(
                    sum(
                        bkPathUsage.get(bkPath, {}).get(infoType, (0, 0))[i]
                        for bkPath in bkPathPopularity))
        for bkPath in sorted(bkPathPopularity):
            if bkPath not in (datasetStorage['Disk'] | datasetStorage['Archived'] | datasetStorage['Tape']):
                ses = usedSEs.get(bkPath)
                if ses is None:
                    gLogger.error("BK path not in usedSEs", bkPath)
                else:
                    datasetStorage[storageType(ses)].add(bkPath)
            nLfns, lfnSize = bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))
            nPfns, pfnSize = bkPathUsage.get(bkPath, {}).get('All', (0, 0))
            gLogger.always(
                '%s (%d LFNs, %s), (%d PFNs, %s, %.1f replicas)' %
                (bkPath, nLfns, prSize(lfnSize), nPfns, prSize(pfnSize),
                 float(nPfns) / float(nLfns) if nLfns else 0.))
            bins = sorted(bkPathPopularity[bkPath])
            lastBin = bins[-1]
            accesses = sum(bkPathPopularity[bkPath][binNumber]
                           for binNumber in bins)
            gLogger.always(
                '\tUsed first in %s, %d accesses (%.1f%%), %d accesses during last %s %s'
                % (prBinNumber(bins[0]), accesses, accesses * 100. /
                   nLfns if nLfns else 0., bkPathPopularity[bkPath][lastBin],
                   binSize, prBinNumber(lastBin)))
        gLogger.always(
            "\nA total of %d LFNs (%s), %d PFNs (%s) have been used" %
            (counters['LFN'][0], prSize(counters['LFN'][1]),
             counters['All'][0], prSize(counters['All'][1])))

    if getAllDatasets:
        # Consider only unused directories
        unusedDirectories = allDirectoriesSet - usedDirectories
        if unusedDirectories:
            gLogger.always(
                "\n============================================================="
            )
            gLogger.always('%d directories have not been used' %
                           len(unusedDirectories))
            # Remove the used datasets (from other directories)
            unusedBKPaths = set(
                bkPathForDir[dirLfn] for dirLfn in unusedDirectories
                if dirLfn in bkPathForDir) - set(bkPathPopularity)
            # Remove empty datasets
            strangeBKPaths = set(
                bkPath for bkPath in unusedBKPaths
                if not bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))[0])
            if strangeBKPaths:
                gLogger.always('%d unused datasets do not have an LFN count:' %
                               len(strangeBKPaths))
                gLogger.always('\n'.join(
                    "%s : %s" % (bkPath, str(bkPathUsage.get(bkPath, {})))
                    for bkPath in strangeBKPaths))
            unusedBKPaths = set(
                bkPath for bkPath in unusedBKPaths
                if bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))[0])

            # In case datasets are both on tape and disk, privilege tape
            datasetStorage['Disk'] -= datasetStorage['Tape']
            gLogger.always(
                "\nThe following %d BK paths were not used since %d days" %
                (len(unusedBKPaths), since))
            for infoType in storageTypes[0:3]:
                gLogger.always("\n=========== %s datasets ===========" %
                               infoType)
                unusedPaths = unusedBKPaths & datasetStorage[infoType]
                counters = {}
                for ty in ('All', 'LFN'):
                    for i in range(2):
                        counters.setdefault(ty, []).append(
                            sum(
                                bkPathUsage.get(bkPath, {}).get(ty, (0, 0))[i]
                                for bkPath in unusedPaths))
                for bkPath in sorted(unusedPaths):
                    nLfns, lfnSize = bkPathUsage.get(bkPath,
                                                     {}).get('LFN', (0, 0))
                    nPfns, pfnSize = bkPathUsage.get(bkPath,
                                                     {}).get('All', (0, 0))
                    gLogger.always(
                        '\t%s (%d LFNs, %s), (%d PFNs, %s, %.1f replicas)' %
                        (bkPath, nLfns, prSize(lfnSize), nPfns,
                         prSize(pfnSize), float(nPfns) / float(nLfns)))
                gLogger.always(
                    "\nA total of %d %s LFNs (%s), %d PFNs (%s) were not used"
                    %
                    (counters['LFN'][0], infoType, prSize(counters['LFN'][1]),
                     counters['All'][0], prSize(counters['All'][1])))
    else:
        unusedBKPaths = set()
        datasetStorage['Disk'] -= datasetStorage['Tape']

    # Now create a CSV file with all dataset information
    # Name, ProcessingPass, #files, size, SE type, each week's usage (before now)
    csvFile = 'popularity-%ddays.csv' % since if csvFile is None else csvFile
    gLogger.always(
        "\n=============================================================")
    gLogger.always('Creating %s file with %d datasets' %
                   (csvFile, len(bkPathPopularity) + len(unusedBKPaths)))
    with open(csvFile, 'w') as fd:
        title = "Name;Configuration;ProcessingPass;FileType;Type;Creation-%s;" % binSize + \
                "NbLFN;LFNSize;NbDisk;DiskSize;NbTape;TapeSize;NbArchived;ArchivedSize;" + \
                ';'.join(site.split('.')[1] for site in storageSites) + \
                ";Nb Replicas;Nb ArchReps;Storage;FirstUsage;LastUsage;Now"
        for binNumber in range(nbBins):
            title += ';%d' % (1 + binNumber)
        fd.write(title + '\n')
        teraByte = 1000. * 1000. * 1000. * 1000.
        for bkPath in sorted(bkPathPopularity) + sorted(unusedBKPaths):
            # Skip unknown datasets
            if bkPath.startswith('Unknown-'):
                continue
            # Not interested in histograms
            splitBKPath = bkPath.split('/')
            fileType = splitBKPath[-1]
            if 'HIST' in fileType:
                continue
            # Only RAW for partition LHCb may be of interest (and even...)
            if fileType == 'RAW' and not bkPath.startswith('/LHCb'):
                continue
            info = bkPathUsage.get(bkPath, {})
            # check if the production is still active
            prods = prodForBKPath[bkPath]
            res = transClient.getTransformations(
                {'TransformationID': list(prods)})
            creationTime = datetime.now()
            active = []
            for prodDict in res.get('Value', []):
                creationTime = min(creationTime, prodDict['CreationDate'])
                if prodDict['Status'] in ('Active', 'Idle', 'Completed'):
                    active.append(str(prodDict['TransformationID']))
            if active:
                gLogger.always("Active productions %s found in %s" %
                               (','.join(sorted(active)), bkPath))
            if info['LFN'][0] == 0:
                continue
            for infoType in info:
                info[infoType][1] /= teraByte
            # Some BK paths contain a ',', to be replaced by a '.' for the CSV file!
            config = '/'.join(splitBKPath[0:3])
            if ',' in bkPath:
                gLogger.always("BK path found with ',':", bkPath)
            # Name,Configuration,ProcessingPass, FileType
            row = '%s;%s;%s;%s' % (bkPath.replace('Real Data', 'RealData'),
                                   config, processingPass.get(
                                       bkPath, 'Unknown').replace(
                                           'Real Data', 'RealData'), fileType)
            # Type
            configTypes = {'/MC/Dev': 2, '/MC/Upgrade': 3}
            configType = configTypes.get(config,
                                         0 if bkPath.startswith('/MC') else 1)
            row += ';%d' % configType
            # CreationTime
            row += ';%d' % (getTimeBin(creationTime))
            # NbLFN,LFNSize,NbDisk,DiskSize,NbTape,TapeSize, NbArchived,ArchivedSize
            for infoType in ('LFN', 'Disk', 'Tape', 'Archived'):
                row += ';%d;%f' % tuple(info[infoType])
            for site in storageSites:
                row += ';%f' % info[site][1]
            row += ';%f;%f' % (float(info['Disk'][0]) / float(info['LFN'][0]),
                               float(info['Archived'][0]) /
                               float(info['LFN'][0]))
            if active:
                dsType = 'Active'
            else:
                dsType = 'Unknown'
                for infoType in storageTypes[0:3]:
                    if bkPath in datasetStorage[infoType]:
                        dsType = infoType
                        break
            row += ';%s' % dsType
            bins = sorted(bkPathPopularity.get(bkPath, {}))
            if not bins:
                bins = [0]
            row += ';%d;%d;%d' % (bins[0], bins[-1], nowBin)
            usage = 0
            for binNumber in range(nbBins):
                usage += bkPathPopularity.get(bkPath,
                                              {}).get(nowBin - binNumber, 0)
                row += ';%d' % usage
            fd.write(row + '\n')
    gLogger.always('\nSuccessfully wrote CSV file %s' % csvFile)
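
scanPopularity assumes several helpers defined elsewhere in the script (getTimeBin, prBinNumber, prSize, the duClient and transClient clients, and module-level caches such as bkPathUsage). A hypothetical sketch of the three small binning/formatting helpers, with semantics inferred from their call sites; the real implementations may differ:

from datetime import date

def getTimeBin(when):
    """Integer week bin for a date/datetime (assumed: calendar weeks since day zero)."""
    return when.toordinal() // 7

def prBinNumber(binNumber):
    """Print a week bin as the ISO date on which that week starts."""
    return date.fromordinal(7 * binNumber).isoformat()

def prSize(size):
    """Human-readable size in decimal units."""
    for unit in ('B', 'kB', 'MB', 'GB', 'TB', 'PB'):
        if size < 1000.:
            return '%.1f %s' % (size, unit)
        size /= 1000.
    return '%.1f EB' % size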