def putRequest(self, requestID, taskResult=None):
    """ put back :requestID: to RequestClient

    :param str requestID: request's id
    """
    if requestID in self.__requestCache:
      request = self.__requestCache.pop(requestID)
      if taskResult:
        if taskResult['OK']:
          request = taskResult['Value']
          # The RequestTask is putting back the Done tasks, no need to redo it
          if request.Status == 'Done':
            return S_OK()
        # In case of timeout, we need to increment all the attempts ourselves
        elif cmpError(taskResult, errno.ETIME):
          waitingOp = request.getWaiting()
          for rmsFile in waitingOp.get('Value', []):
            rmsFile.Attempt += 1

      reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
      if not reset["OK"]:
        return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))
    else:
      return S_ERROR('Not in cache')
    return S_OK()
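All the snippets in this collection share one pattern: a DIRAC call returns an S_OK/S_ERROR structure, and cmpError checks whether a failed result carries a specific error number. A minimal sketch of that pattern, assuming a standard DIRAC installation where cmpError lives in DIRAC.Core.Utilities.DErrno and S_ERROR accepts an (errno, message) pair:

import errno

from DIRAC import S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError

# S_ERROR can carry an error number alongside the message
res = S_ERROR(errno.ETIME, "operation timed out")

assert not res["OK"]
assert cmpError(res, errno.ETIME)       # matches the stored error number
assert not cmpError(res, errno.ENOENT)  # a different errno does not match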
Example #2
    def getTransportURL(self, urls, protocols):
        """Get a transport URL for given urls
            If http/https is requested, the URLs will be valid for 24 hours

        :param dict urls: s3 urls
        :param list protocols: a list of acceptable transport protocols in priority order.
                          In practice, besides 's3', it can only be:

                          * 'https' if secureConnection is True
                          * 'http' otherwise

        :returns: succ/failed dict url with required protocol

        """

        res = super(S3Storage, self).getTransportURL(urls, protocols)
        # If the result is OK, or the error is different from errno.EPROTONOSUPPORT,
        # just return it
        if not cmpError(res, errno.EPROTONOSUPPORT):
            return res

        # We only support http for insecure connections and https for secure ones
        if self.secureConnection and "https" not in protocols:
            return S_ERROR(errno.EPROTONOSUPPORT, "Only https protocol is supported")
        elif not self.secureConnection and "http" not in protocols:
            return S_ERROR(errno.EPROTONOSUPPORT, "Only http protocol is supported")

        # Make the presigned URLs valid for 24h
        if self.directAccess:
            return self.createPresignedUrl(urls, "get_object", expiration=60 * 60 * 24)

        return self.S3GatewayClient.createPresignedUrl(self.name, "get_object", urls, expiration=60 * 60 * 24)
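The pattern worth noting above is delegating to the base class first and handling only errno.EPROTONOSUPPORT locally. A self-contained sketch of that negotiation step, with negotiateProtocol as a hypothetical helper rather than part of the S3Storage API:

import errno

from DIRAC import S_OK, S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError

def negotiateProtocol(requested, supported):
    """Hypothetical helper: pick the first requested protocol we support,
    or fail with EPROTONOSUPPORT as getTransportURL does above."""
    for protocol in requested:
        if protocol in supported:
            return S_OK(protocol)
    return S_ERROR(errno.EPROTONOSUPPORT, "Only %s supported" % "/".join(supported))

res = negotiateProtocol(["root", "gsiftp"], ["https"])
if cmpError(res, errno.EPROTONOSUPPORT):
    # Same branch as above: the caller can fall back to another storage plugin
    print("no common protocol, trying the next storage element")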
Example #3
    def __extractCSData(self, section):
        """Extract limiting information from the CS in the form:
        { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
        """
        stuffDict = self.csDictCache.get(section)
        if stuffDict:
            return S_OK(stuffDict)

        result = self.__opsHelper.getSections(section)
        if not result["OK"]:
            if cmpError(result, ESECTION):
                return S_OK({})
            return result
        attribs = result["Value"]
        stuffDict = {}
        for attName in attribs:
            result = self.__opsHelper.getOptionsDict("%s/%s" % (section, attName))
            if not result["OK"]:
                return result
            attLimits = result["Value"]
            try:
                attLimits = dict([(k, int(attLimits[k])) for k in attLimits])
            except Exception as excp:
                errMsg = "%s/%s has to contain numbers: %s" % (section, attName, str(excp))
                self.log.error(errMsg)
                return S_ERROR(errMsg)
            stuffDict[attName] = attLimits

        self.csDictCache.add(section, 300, stuffDict)
        return S_OK(stuffDict)
Example #4
  def putRequest( self, requestID, taskResult = None ):
    """ put back :requestID: to RequestClient

    :param str requestID: request's id
    """
    if requestID in self.__requestCache:
      request = self.__requestCache.pop( requestID )
      if taskResult:
        if taskResult['OK']:
          request = taskResult['Value']
          # The RequestTask is putting back the Done tasks, no need to redo it
          if request.Status == 'Done':
            return S_OK()
        # In case of timeout, we need to increment all the attempts ourselves
        elif cmpError( taskResult, errno.ETIME ):
          waitingOp = request.getWaiting()
          for rmsFile in waitingOp.get( 'Value', [] ):
            rmsFile.Attempt += 1

      reset = self.requestClient().putRequest( request, useFailoverProxy = False, retryMainService = 2 )
      if not reset["OK"]:
        return S_ERROR( "putRequest: unable to reset request %s: %s" % ( requestID, reset["Message"] ) )
    else:
      return S_ERROR( 'Not in cache' )
    return S_OK()
Example #5
def test_submit_multiHopTransfer_failure_badLFN():
    """Do a a multiHop transfer, but the LFN is bad (first loop failure)"""

    newJob = generateFTS3Job("CNAF-DST", "RAL-DST", ["/badLFN/f1"], multiHopSE="CERN-DST")

    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.ENODATA)
Example #6
def test_submit_multiHopTransfer_failure_multipleFiles():
    """multihop with more than one file (not allowed)"""

    newJob = generateFTS3Job("CNAF-DST", "RAL-DST", ["/lhcb/f1", "/lhcb/f2"], multiHopSE="CERN-DST")

    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.E2BIG)
Example #7
def test_submit_multiHopStaging_multipleFiles():
    """A multihop transfer cannot have more than one file at the time"""

    newJob = generateFTS3Job("CERN-RAW", "CNAF-DST", ["/lhcb/f1", "/lhcb/f2"])

    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.E2BIG)
Example #8
def test_submit_multiHopStaging_failureBadLFN():
    """We do a multi hop stage that fails because of a bad LFN"""

    newJob = generateFTS3Job("CERN-RAW", "CNAF-DST", ["/badLFN/f1"])
    # We should get a complete failure
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.ENODATA)
Example #9
def test_submit_directJob_allFailed():
    """Simple transfer of two files, with all LFNs problematic"""

    newJob = generateFTS3Job("CERN-DST", "CNAF-DST", ["/badLFN/f1", "/badLFN/f2"])
    # We should get a complete failure
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.ENODATA)
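The tests above all assert failures the same way: the call must return a failed structure whose error number matches the expected errno. A sketch of that pytest idiom against a hypothetical validateLFNs helper (not the real FTS3Job code):

import errno

from DIRAC import S_OK, S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError

def validateLFNs(lfns):
    """Hypothetical stand-in for the LFN checks done by _constructTransferJob."""
    if any(lfn.startswith("/badLFN") for lfn in lfns):
        return S_ERROR(errno.ENODATA, "No replica found for bad LFN")
    return S_OK(lfns)

def test_validateLFNs_badLFN():
    res = validateLFNs(["/badLFN/f1"])
    assert not res["OK"]
    assert cmpError(res, errno.ENODATA)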
Example #10
    def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):

        log = self._log.getSubLogger("_prepareNewJobs", child=True)

        filesToSubmit = self._getFilesToSubmit(
            maxAttemptsPerFile=maxAttemptsPerFile)
        log.debug("%s ftsFiles to submit" % len(filesToSubmit))

        newJobs = []

        # {targetSE : [FTS3Files] }
        res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
        if not res['OK']:
            return res
        filesGroupedByTarget = res['Value']

        for targetSE, ftsFiles in filesGroupedByTarget.iteritems():

            res = self._checkSEAccess(targetSE, 'WriteAccess', vo=self.vo)

            if not res['OK']:
                # If the SE is currently banned, we just skip it
                if cmpError(res, errno.EACCES):
                    log.info(
                        "Write access currently not permitted to %s, skipping."
                        % targetSE)
                else:
                    log.error(res)
                    for ftsFile in ftsFiles:
                        ftsFile.attempt += 1
                continue

            sourceSEs = self.sourceSEs.split(
                ',') if self.sourceSEs is not None else []
            # { sourceSE : [FTSFiles] }
            res = FTS3Utilities.selectUniqueRandomSource(
                ftsFiles, allowedSources=sourceSEs)

            if not res['OK']:
                return res

            uniqueTransfersBySource = res['Value']

            # We don't need to check the source, since it is already filtered by the DataManager
            for sourceSE, ftsFiles in uniqueTransfersBySource.iteritems():

                for ftsFilesChunk in breakListIntoChunks(
                        ftsFiles, maxFilesPerJob):

                    newJob = self._createNewJob('Transfer',
                                                ftsFilesChunk,
                                                targetSE,
                                                sourceSE=sourceSE)

                    newJobs.append(newJob)

        return S_OK(newJobs)
Example #11
def test_submit_multiHopTransfer_failure_protocolSecondHop():
    """Multi hop with second hop impossible (no protocol compatible between CNAF and RAL"""

    newJob = generateFTS3Job("CERN-DST", "RAL-DST", ["/lhcb/f1"], multiHopSE="CNAF-DST")

    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    # Check that the error is no common protocol
    assert cmpError(res, errno.ENOPROTOOPT)
Example #12
def test_submit_direct_noPotocol():
    """Direct transfer with no common protocol. It is a failure"""

    newJob = generateFTS3Job("CNAF-DST", "RAL-DST", ["/lhcb/f1", "/lhcb/f2"])

    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")

    assert not res["OK"]
    # Check that the error is no common protocol
    assert cmpError(res, errno.ENOPROTOOPT)
Example #13
    def executeRPC(self, functionName, args):
        """Perform the RPC call, connect before and disconnect after.

        :param functionName: name of the function
        :param args: arguments to the function

        :return: in case of success, the return of the server call. In any case
                we add the connection stub to it.


        """
        retVal = self._connect()

        # Generate the stub which contains all the connection and call options
        # JSON: cast args to list for serialization purposes
        stub = [self._getBaseStub(), functionName, list(args)]
        if not retVal["OK"]:
            retVal["rpcStub"] = stub
            return retVal
        # Get the transport connection ID as well as the Transport object
        trid, transport = retVal["Value"]
        try:
            # Handshake to perform the RPC call for functionName
            retVal = self._proposeAction(transport, ("RPC", functionName))
            if not retVal["OK"]:
                if cmpError(retVal, ENOAUTH):  # This query is unauthorized
                    retVal["rpcStub"] = stub
                    return retVal
                else:  # we have network problem or the service is not responding
                    if self.__retry < 3:
                        self.__retry += 1
                        return self.executeRPC(functionName, args)
                    else:
                        retVal["rpcStub"] = stub
                        return retVal

            # Send the arguments to the function
            # Note: we need to convert the arguments to list
            # We do not need to deserialize it because variadic functions
            # can work with list too
            retVal = transport.sendData(S_OK(list(args)))
            if not retVal["OK"]:
                return retVal

            # Get the result of the call and append the stub to it
            # Note that the RPC timeout basically ticks here, since
            # the client waits for data for as long as the server side
            # processes the request.
            receivedData = transport.receiveData()
            if isinstance(receivedData, dict):
                receivedData["rpcStub"] = stub
            return receivedData
        finally:
            self._disconnect(trid)
Example #14
  def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):

    log = self._log.getSubLogger("_prepareNewJobs", child=True)

    filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile)
    log.debug("%s ftsFiles to submit" % len(filesToSubmit))

    newJobs = []

    # {targetSE : [FTS3Files] }
    res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
    if not res['OK']:
      return res
    filesGroupedByTarget = res['Value']

    for targetSE, ftsFiles in filesGroupedByTarget.iteritems():

      res = self._checkSEAccess(targetSE, 'WriteAccess', vo=self.vo)

      if not res['OK']:
        # If the SE is currently banned, we just skip it
        if cmpError(res, errno.EACCES):
          log.info("Write access currently not permitted to %s, skipping." % targetSE)
        else:
          log.error(res)
          for ftsFile in ftsFiles:
            ftsFile.attempt += 1
        continue

      sourceSEs = self.sourceSEs.split(',') if self.sourceSEs is not None else []
      # { sourceSE : [FTSFiles] }
      res = FTS3Utilities.selectUniqueRandomSource(ftsFiles, allowedSources=sourceSEs)

      if not res['OK']:
        return res

      uniqueTransfersBySource = res['Value']

      # We don't need to check the source, since it is already filtered by the DataManager
      for sourceSE, ftsFiles in uniqueTransfersBySource.iteritems():

        for ftsFilesChunk in breakListIntoChunks(ftsFiles, maxFilesPerJob):

          newJob = self._createNewJob('Transfer', ftsFilesChunk, targetSE, sourceSE=sourceSE)

          newJobs.append(newJob)

    return S_OK(newJobs)
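Example #15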
  def cleanTransformationLogFiles(self, directory):
    """ clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    """
    self.log.verbose("Removing log files found in the directory %s" % directory)
    res = returnSingleResult(StorageElement(self.logSE).removeDirectory(directory, recursive=True))
    if not res['OK']:
      if cmpError(res, errno.ENOENT):  # No such file or directory
        self.log.warn("Transformation log directory does not exist", directory)
        return S_OK()
      self.log.error("Failed to remove log files", res['Message'])
      return res
    self.log.info("Successfully removed transformation log directory")
    return S_OK()
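cleanTransformationLogFiles shows a common idempotency trick: removing something that is already gone counts as success. A generic sketch of the same guard, assuming only S_OK/S_ERROR/cmpError from DIRAC (removeEntry and removeIfExists are hypothetical):

import errno

from DIRAC import S_OK, S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError

def removeIfExists(removeEntry, path):
    """Hypothetical wrapper: treat 'No such file or directory' as success
    so that repeated cleanups stay idempotent."""
    res = removeEntry(path)
    if not res["OK"] and cmpError(res, errno.ENOENT):
        return S_OK("already absent")
    return res

def _fakeRemove(path):
    # Stand-in for a real removeDirectory call that finds nothing to delete
    return S_ERROR(errno.ENOENT, "No such file or directory")

print(removeIfExists(_fakeRemove, "/logs/prod1234"))  # -> {'OK': True, 'Value': 'already absent'}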
Example #16
    def executeRPC(self, functionName, args):
        """ Perform the RPC call, connect before and disconnect after.

        :param functionName: name of the function
        :param args: arguments to the function

        :return: in case of success, the return of the server call. In any case
                we add the connection stub to it.


    """
        retVal = self._connect()

        # Generate the stub which contains all the connection and call options
        stub = (self._getBaseStub(), functionName, args)
        if not retVal['OK']:
            retVal['rpcStub'] = stub
            return retVal
        # Get the transport connection ID as well as the Transport object
        trid, transport = retVal['Value']
        try:
            # Handshake to perform the RPC call for functionName
            retVal = self._proposeAction(transport, ("RPC", functionName))
            if not retVal['OK']:
                if cmpError(retVal, ENOAUTH):  # This query is unauthorized
                    retVal['rpcStub'] = stub
                    return retVal
                else:  # we have network problem or the service is not responding
                    if self.__retry < 3:
                        self.__retry += 1
                        return self.executeRPC(functionName, args)
                    else:
                        retVal['rpcStub'] = stub
                        return retVal

            # Send the arguments to the function
            retVal = transport.sendData(S_OK(args))
            if not retVal['OK']:
                return retVal

            # Get the result of the call and append the stub to it
            receivedData = transport.receiveData()
            if isinstance(receivedData, dict):
                receivedData['rpcStub'] = stub
            return receivedData
        finally:
            self._disconnect(trid)
Example #17
  def executeRPC( self, functionName, args ):
    """ Perform the RPC call, connect before and disconnect after.

        :param functionName: name of the function
        :param args: arguments to the function

        :return: in case of success, the return of the server call. In any case
                we add the connection stub to it.


    """
    retVal = self._connect()

    # Generate the stub which contains all the connection and call options
    stub = ( self._getBaseStub(), functionName, args )
    if not retVal[ 'OK' ]:
      retVal[ 'rpcStub' ] = stub
      return retVal
    # Get the transport connection ID as well as the Transport object
    trid, transport = retVal[ 'Value' ]
    try:
      # Handshake to perform the RPC call for functionName
      retVal = self._proposeAction( transport, ( "RPC", functionName ) )
      if not retVal['OK']:
        if cmpError( retVal, ENOAUTH ):  # This query is unauthorized
          retVal[ 'rpcStub' ] = stub
          return retVal
        else:  # we have network problem or the service is not responding
          if self.__retry < 3:
            self.__retry += 1
            return self.executeRPC( functionName, args )
          else:
            retVal[ 'rpcStub' ] = stub
            return retVal

      # Send the arguments to the function
      retVal = transport.sendData( S_OK( args ) )
      if not retVal[ 'OK' ]:
        return retVal

      # Get the result of the call and append the stub to it
      receivedData = transport.receiveData()
      if isinstance( receivedData, dict ):
        receivedData[ 'rpcStub' ] = stub
      return receivedData
    finally:
      self._disconnect( trid )
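Example #18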
    def cleanTransformationLogFiles(self, directory):
        """ clean up transformation logs from directory :directory:

        :param self: self reference
        :param str directory: folder name
        """
        self.log.verbose("Removing log files found in the directory %s" %
                         directory)
        res = returnSingleResult(
            StorageElement(self.logSE).removeDirectory(directory,
                                                       recursive=True))
        if not res['OK']:
            if cmpError(res, errno.ENOENT):  # No such file or directory
                self.log.warn("Transformation log directory does not exist",
                              directory)
                return S_OK()
            self.log.error("Failed to remove log files", res['Message'])
            return res
        self.log.info("Successfully removed transformation log directory")
        return S_OK()
Example #19
    def __fetchSpaceToken(seName):
        """ Fetch the space token of storage element

        :param seName name of the storageElement

        :returns space token. If there is no SpaceToken defined, returns None
    """
        seToken = None
        if seName:
            seObj = StorageElement(seName)

            res = seObj.getStorageParameters(protocol='srm')
            if not res['OK']:
                # If there is no SRM protocol, we do not specify
                # the space token
                if cmpError(res, errno.ENOPROTOOPT):
                    return S_OK(None)

                return res

            seToken = res["Value"].get("SpaceToken")

        return S_OK(seToken)
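__fetchSpaceToken converts one specific failure (no SRM protocol, errno.ENOPROTOOPT) into a successful S_OK(None), because a missing space token is a valid answer. A generic sketch of that "optional value" pattern, with getParameters as a hypothetical callable:

import errno

from DIRAC import S_OK, S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError

def fetchOptional(getParameters, key):
    """Hypothetical helper: a missing protocol is not an error here,
    it simply means the optional value is undefined."""
    res = getParameters()
    if not res["OK"]:
        if cmpError(res, errno.ENOPROTOOPT):
            return S_OK(None)  # no SRM protocol -> no space token, still OK
        return res
    return S_OK(res["Value"].get(key))

res = fetchOptional(lambda: S_ERROR(errno.ENOPROTOOPT, "no srm protocol"), "SpaceToken")
assert res["OK"] and res["Value"] is None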
Example #20
  def __fetchSpaceToken(seName):
    """ Fetch the space token of storage element

        :param seName name of the storageElement

        :returns space token. If there is no SpaceToken defined, returns None
    """
    seToken = None
    if seName:
      seObj = StorageElement(seName)

      res = seObj.getStorageParameters(protocol='srm')
      if not res['OK']:
        # If there is no SRM protocol, we do not specify
        # the space token
        if cmpError(res, errno.ENOPROTOOPT):
          return S_OK(None)

        return res

      seToken = res["Value"].get("SpaceToken")

    return S_OK(seToken)
Example #21
def getFilesToStage(lfnList,
                    jobState=None,
                    checkOnlyTapeSEs=None,
                    jobLog=None):
    """ Utility that returns out of a list of LFNs those files that are offline,
      and those for which at least one copy is online
  """
    if not lfnList:
        return S_OK({
            'onlineLFNs': [],
            'offlineLFNs': {},
            'failedLFNs': [],
            'absentLFNs': {}
        })

    dm = DataManager()
    if isinstance(lfnList, six.string_types):
        lfnList = [lfnList]

    lfnListReplicas = dm.getReplicasForJobs(lfnList, getUrl=False)
    if not lfnListReplicas['OK']:
        return lfnListReplicas

    offlineLFNsDict = {}
    onlineLFNs = {}
    offlineLFNs = {}
    absentLFNs = {}
    failedLFNs = set()
    # Initialize here so the final S_OK below cannot hit a NameError
    # when there are no replicas to check
    onlineSites = set()
    if lfnListReplicas['Value']['Failed']:
        # Check if files are not existing
        for lfn, reason in lfnListReplicas['Value']['Failed'].items():
            # FIXME: awful check until FC returns a proper error
            if cmpError(reason, errno.ENOENT) or 'No such file' in reason:
                # The file doesn't exist, job must be Failed
                # FIXME: it is not possible to return here an S_ERROR(), return the message only
                absentLFNs[lfn] = S_ERROR(errno.ENOENT,
                                          'File not in FC')['Message']
        if absentLFNs:
            return S_OK({
                'onlineLFNs': list(onlineLFNs),
                'offlineLFNs': offlineLFNsDict,
                'failedLFNs': list(failedLFNs),
                'absentLFNs': absentLFNs
            })
        return S_ERROR("Failures in getting replicas")

    lfnListReplicas = lfnListReplicas['Value']['Successful']
    # If a file is reported here at a tape SE, it is not at a disk SE as we use disk in priority
    # We shall check all files anyway in order to make sure they exist
    seToLFNs = dict()
    for lfn, ses in lfnListReplicas.items():
        for se in ses:
            seToLFNs.setdefault(se, list()).append(lfn)

    if seToLFNs:
        if jobState:
            # Get user name and group from the job state
            userName = jobState.getAttribute('Owner')
            if not userName['OK']:
                return userName
            userName = userName['Value']

            userGroup = jobState.getAttribute('OwnerGroup')
            if not userGroup['OK']:
                return userGroup
            userGroup = userGroup['Value']
        else:
            userName = None
            userGroup = None
        # Check whether files are Online or Offline, or missing at SE
        result = _checkFilesToStage(
            seToLFNs,
            onlineLFNs,
            offlineLFNs,
            absentLFNs,  # pylint: disable=unexpected-keyword-arg
            checkOnlyTapeSEs=checkOnlyTapeSEs,
            jobLog=jobLog,
            proxyUserName=userName,
            proxyUserGroup=userGroup,
            executionLock=True)

        if not result['OK']:
            return result
        failedLFNs = set(lfnList) - set(onlineLFNs) - set(offlineLFNs) - set(
            absentLFNs)

        # Get the online SEs
        dmsHelper = DMSHelpers()
        onlineSEs = set(se for ses in onlineLFNs.values() for se in ses)
        onlineSites = set(
            dmsHelper.getLocalSiteForSE(se).get('Value')
            for se in onlineSEs) - {None}
        for lfn in offlineLFNs:
            ses = offlineLFNs[lfn]
            if len(ses) == 1:
                # No choice, let's go
                offlineLFNsDict.setdefault(ses[0], list()).append(lfn)
                continue
            # Try and get an SE at a site already with online files
            found = False
            if onlineSites:
                # If there is at least one online site, select one
                for se in ses:
                    site = dmsHelper.getLocalSiteForSE(se)
                    if site['OK']:
                        if site['Value'] in onlineSites:
                            offlineLFNsDict.setdefault(se, list()).append(lfn)
                            found = True
                            break
            # No online site found in common, select randomly
            if not found:
                offlineLFNsDict.setdefault(random.choice(ses),
                                           list()).append(lfn)

    return S_OK({
        'onlineLFNs': list(onlineLFNs),
        'offlineLFNs': offlineLFNsDict,
        'failedLFNs': list(failedLFNs),
        'absentLFNs': absentLFNs,
        'onlineSites': onlineSites
    })
Example #22
def _checkFilesToStage(seToLFNs,
                       onlineLFNs,
                       offlineLFNs,
                       absentLFNs,
                       checkOnlyTapeSEs=None,
                       jobLog=None,
                       proxyUserName=None,
                       proxyUserGroup=None,
                       executionLock=None):
    """
  Checks on SEs whether the file is NEARLINE or ONLINE
  onlineLFNs, offlineLFNs and absentLFNs are modified to contain the files found online
  If checkOnlyTapeSEs is True, disk replicas are not checked
  As soon as a replica is found Online for a file, no further check is made
  """
    # Only check on storage if it is a tape SE
    if jobLog is None:
        logger = gLogger
    else:
        logger = jobLog
    if checkOnlyTapeSEs is None:
        # Default value is True
        checkOnlyTapeSEs = True

    failed = {}
    for se, lfnsInSEList in seToLFNs.items():
        # If we have already found all files online at another SE, there is no need to check the others,
        # but we still want to mark the file as Online here if this is not a tape SE
        vo = getVOForGroup(proxyUserGroup)
        seObj = StorageElement(se, vo=vo)
        status = seObj.getStatus()
        if not status['OK']:
            return status
        tapeSE = status['Value']['TapeSE']
        diskSE = status['Value']['DiskSE']
        # If requested to check only Tape SEs and the file is at a diskSE, we guess it is Online...
        filesToCheck = []
        for lfn in lfnsInSEList:
            # If the file had already been found accessible at an SE, only check that this one is on disk
            diskIsOK = checkOnlyTapeSEs or (lfn in onlineLFNs)
            if diskIsOK and diskSE:
                onlineLFNs.setdefault(lfn, []).append(se)
            elif not diskIsOK or (tapeSE and (lfn not in onlineLFNs)):
                filesToCheck.append(lfn)
        if not filesToCheck:
            continue

        # We have to use a new SE object because it caches the proxy!
        with UserProxy(proxyUserName=proxyUserName,
                       proxyUserGroup=proxyUserGroup,
                       executionLock=executionLock) as proxyResult:
            if proxyResult['OK']:
                fileMetadata = StorageElement(
                    se, vo=vo).getFileMetadata(filesToCheck)
            else:
                fileMetadata = proxyResult

        if not fileMetadata['OK']:
            failed[se] = dict.fromkeys(filesToCheck, fileMetadata['Message'])
        else:
            if fileMetadata['Value']['Failed']:
                failed[se] = fileMetadata['Value']['Failed']
            # is there at least one replica online?
            for lfn, mDict in fileMetadata['Value']['Successful'].items():
                # SRM returns Cached, but others may only return Accessible
                if mDict.get('Cached', mDict['Accessible']):
                    onlineLFNs.setdefault(lfn, []).append(se)
                elif tapeSE:
                    # A file can be staged only at Tape SE
                    offlineLFNs.setdefault(lfn, []).append(se)
                else:
                    # File not available at a diskSE... we shall retry later
                    pass

    # Doesn't matter if some files are Offline if they are also online
    for lfn in set(offlineLFNs) & set(onlineLFNs):
        offlineLFNs.pop(lfn)

    # If the file was found staged, ignore possible errors, but print them out
    for se, failedLfns in list(failed.items()):
        logger.error("Errors when getting files metadata", 'at %s' % se)
        for lfn, reason in list(failedLfns.items()):
            if lfn in onlineLFNs:
                logger.warn(reason,
                            'for %s, but there is an online replica' % lfn)
                failed[se].pop(lfn)
            else:
                logger.error(reason, 'for %s, no online replicas' % lfn)
                if cmpError(reason, errno.ENOENT):
                    absentLFNs.setdefault(lfn, []).append(se)
                    failed[se].pop(lfn)
        if not failed[se]:
            failed.pop(se)
    # Find the files that do not exist at SE
    if failed:
        logger.error(
            "Error getting metadata", "for %d files" %
            len(set(lfn for lfnList in failed.values() for lfn in lfnList)))

    for lfn in absentLFNs:
        seList = absentLFNs[lfn]
        # FIXME: it is not possible to return here an S_ERROR(), return the message only
        absentLFNs[lfn] = S_ERROR(errno.ENOENT, "File not at %s" %
                                  ','.join(sorted(seList)))['Message']
    # Format the error for absent files
    return S_OK()
Example #23
  def execute( self ):
    """ read requests from RequestClient and enqueue them into ProcessPool """
    gMonitor.addMark( "Iteration", 1 )
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug( "execute: executing %d request in this cycle" % taskCounter )

      requestsToExecute = []

      if not self.__bulkRequest:
        self.log.info( "execute: ask for a single request" )
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error( "execute: %s" % getRequest["Message"] )
          break
        if not getRequest["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        requestsToExecute = [getRequest["Value"] ]
      else:
        numberOfRequest = min( self.__bulkRequest, self.__requestsPerCycle - taskCounter )
        self.log.info( "execute: ask for %s requests" % numberOfRequest )
        getRequests = self.requestClient().getBulkRequests( numberOfRequest )
        if not getRequests["OK"]:
          self.log.error( "execute: %s" % getRequests["Message"] )
          break
        if not getRequests["Value"]:
          self.log.info( "execute: no more 'Waiting' requests to process" )
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error( "execute: %s" % getRequests["Value"]["Failed"][rId] )

        requestsToExecute = getRequests["Value"]["Successful"].values()

      self.log.info( "execute: will execute %s requests " % len( requestsToExecute ) )

      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestID

        self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(),
                                                                      self.processPool().getNumWorkingProcesses() ) )

        looping = 0
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep )
            time.sleep( self.__poolSleep )
            looping += 1
          else:
            if looping:
              self.log.info( "Free slot found after %d seconds" % looping * self.__poolSleep )
            looping = 0
            # # save current request in cache
            res = self.cacheRequest( request )
            if not res['OK']:
              if cmpError( res, errno.EALREADY ):
                # The request is already in the cache, skip it. break out of the while loop to get next request
                break
              # There are too many requests in the cache, commit suicide
              self.log.error( res['Message'], '(%d requests): put back all requests and exit cycle' % len( self.__requestCache ) )
              self.putAllRequests()
              return res
            # # serialize to JSON
            result = request.toJSON()
            if not result['OK']:
              continue
            requestJSON = result['Value']
            self.log.info( "spawning task for request '%s/%s'" % ( request.RequestID, request.RequestName ) )
            timeOut = self.getTimeout( request )
            enqueue = self.processPool().createAndQueueTask( RequestTask,
                                                             kwargs = { "requestJSON" : requestJSON,
                                                                        "handlersDict" : self.handlersDict,
                                                                        "csPath" : self.__configPath,
                                                                        "agentName": self.agentName },
                                                             taskID = taskID,
                                                             blocking = True,
                                                             usePoolCallbacks = True,
                                                             timeOut = timeOut )
            if not enqueue["OK"]:
              self.log.error( enqueue["Message"] )
            else:
              self.log.debug( "successfully enqueued task '%s'" % taskID )
              # # update monitor
              gMonitor.addMark( "Processed", 1 )
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep( 0.1 )
              break

    self.log.info( 'Flushing callbacks (%d requests still in cache)' % len( self.__requestCache ) )
    processed = self.processPool().processResults()
    # This happens when the result queue is screwed up.
    # Returning S_ERROR proved not to be sufficient,
    # and when in this situation, there is nothing we can do.
    # So we just exit. runit will restart from scratch.
    if processed < 0:
      self.log.fatal("Results queue is screwed up")
      sys.exit(1)
    # # clean return
    return S_OK()
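cacheRequest above distinguishes "already cached" (errno.EALREADY, just skip the request) from "cache full" (put everything back and abort the cycle). A minimal sketch of a cache with that contract; RequestCache here is hypothetical, not the agent's actual implementation:

import errno

from DIRAC import S_OK, S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError

class RequestCache(object):
    """Hypothetical cache mirroring the agent's cacheRequest contract."""

    def __init__(self, maxSize=100):
        self._cache = {}
        self._maxSize = maxSize

    def cacheRequest(self, requestID, request):
        if requestID in self._cache:
            return S_ERROR(errno.EALREADY, "Request already in cache")
        if len(self._cache) >= self._maxSize:
            return S_ERROR("Too many requests in cache")
        self._cache[requestID] = request
        return S_OK()

cache = RequestCache(maxSize=10)
cache.cacheRequest(1, "request payload")
res = cache.cacheRequest(1, "request payload")
assert cmpError(res, errno.EALREADY)  # duplicate entry: safe to just skip it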
Example #24
    def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):

        log = self._log.getSubLogger("_prepareNewJobs")

        filesToSubmit = self._getFilesToSubmit(
            maxAttemptsPerFile=maxAttemptsPerFile)
        log.debug("%s ftsFiles to submit" % len(filesToSubmit))

        newJobs = []

        # {targetSE : [FTS3Files] }
        res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
        if not res["OK"]:
            return res
        filesGroupedByTarget = res["Value"]

        for targetSE, ftsFiles in filesGroupedByTarget.items():

            res = self._checkSEAccess(targetSE, "WriteAccess", vo=self.vo)

            if not res["OK"]:
                # If the SE is currently banned, we just skip it
                if cmpError(res, errno.EACCES):
                    log.info(
                        "Write access currently not permitted to %s, skipping."
                        % targetSE)
                else:
                    log.error(res)
                    for ftsFile in ftsFiles:
                        ftsFile.attempt += 1
                continue

            sourceSEs = self.sourceSEs.split(
                ",") if self.sourceSEs is not None else []
            # { sourceSE : [FTSFiles] }
            res = FTS3Utilities.selectUniqueSource(ftsFiles,
                                                   self.fts3Plugin,
                                                   allowedSources=sourceSEs)

            if not res["OK"]:
                return res

            uniqueTransfersBySource, failedFiles = res["Value"]

            # Treat the errors of the failed files
            for ftsFile, errMsg in failedFiles.items():
                log.error("Error when selecting random sources",
                          "%s, %s" % (ftsFile.lfn, errMsg))
                # If the error is that the file does not exist in the catalog
                # fail it !
                if cmpError(errMsg, errno.ENOENT):
                    log.error("The file does not exist, setting it Defunct",
                              "%s" % ftsFile.lfn)
                    ftsFile.status = "Defunct"

            # We don't need to check the source, since it is already filtered by the DataManager
            for sourceSE, ftsFiles in uniqueTransfersBySource.items():

                # Checking whether we will need multiHop transfer
                multiHopSE = self.fts3Plugin.findMultiHopSEToCoverUpForWLCGFailure(
                    sourceSE, targetSE)
                if multiHopSE:

                    log.verbose(
                        "WLCG failure manifestation, use %s for multihop, max files per job is 1"
                        % multiHopSE)

                    # Check that we can write and read from it
                    try:
                        for accessType in ("Read", "Write"):
                            res = self._checkSEAccess(multiHopSE,
                                                      "%sAccess" % accessType,
                                                      vo=self.vo)

                            if not res["OK"]:
                                # If the SE is currently banned, we just skip it
                                if cmpError(res, errno.EACCES):
                                    log.info(
                                        "Access currently not permitted",
                                        "%s to %s" % (accessType, multiHopSE))

                                else:
                                    log.error("CheckSEAccess error", res)
                                    for ftsFile in ftsFiles:
                                        ftsFile.attempt += 1
                                # If we have a problem with the multiHop SE,
                                # we skip the whole loop for the pair
                                # (targetSE, sourceSE)
                                raise RuntimeError("MultiHopSE unavailable")
                    except RuntimeError:
                        log.info(
                            "Problem with multiHop SE, skipping transfers from %s to %s."
                            % (sourceSE, targetSE))
                        continue

                    maxFilesPerJob = 1
                # Check if we need a multihop staging
                elif self.__needsMultiHopStaging(sourceSE, targetSE):
                    log.verbose(
                        "Needs multihop staging, max files per job is 1")
                    maxFilesPerJob = 1

                for ftsFilesChunk in breakListIntoChunks(
                        ftsFiles, maxFilesPerJob):

                    newJob = self._createNewJob("Transfer",
                                                ftsFilesChunk,
                                                targetSE,
                                                sourceSE=sourceSE,
                                                multiHopSE=multiHopSE)

                    newJobs.append(newJob)

        return S_OK(newJobs)
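Example #25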
def _checkFilesToStage( seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,
                        checkOnlyTapeSEs = None, jobLog = None,
                        proxyUserName = None,
                        proxyUserGroup = None,
                        executionLock = None ):
  """
  Checks on SEs whether the file is NEARLINE or ONLINE
  onlineLFNs, offlineLFNs and absentLFNs are modified to contain the files found online
  If checkOnlyTapeSEs is True, disk replicas are not checked
  As soon as a replica is found Online for a file, no further check is made
  """
  # Only check on storage if it is a tape SE
  if jobLog is None:
    logger = gLogger
  else:
    logger = jobLog
  if checkOnlyTapeSEs is None:
    # Default value is True
    checkOnlyTapeSEs = True

  failed = {}
  for se, lfnsInSEList in seToLFNs.iteritems():
    # No need to check files that are already known to be Online
    lfnsInSEList = list( set( lfnsInSEList ) - onlineLFNs )
    if not lfnsInSEList:
      continue

    seObj = StorageElement( se )
    status = seObj.getStatus()
    if not status['OK']:
      logger.error( "Could not get SE status", "%s - %s" % ( se, status['Message'] ) )
      return status
    tapeSE = status['Value']['TapeSE']
    # If requested to check only Tape SEs and the file is at a diskSE, we guess it is Online...
    if checkOnlyTapeSEs and not tapeSE:
      onlineLFNs.update( lfnsInSEList )
      continue

    # Wrap the SE method with executeWithUserProxy
    fileMetadata = ( executeWithUserProxy( seObj.getFileMetadata )
                    ( lfnsInSEList,
                      proxyUserName = proxyUserName,
                      proxyUserGroup = proxyUserGroup,
                      executionLock = executionLock ) )

    if not fileMetadata['OK']:
      failed[se] = dict.fromkeys( lfnsInSEList, fileMetadata['Message'] )
    else:
      if fileMetadata['Value']['Failed']:
        failed[se] = fileMetadata['Value']['Failed']
      # is there at least one replica online?
      for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
        # SRM returns Cached, but others may only return Accessible
        if mDict.get( 'Cached', mDict['Accessible'] ):
          onlineLFNs.add( lfn )
        elif tapeSE:
          # A file can be staged only at Tape SE
          offlineLFNs.setdefault( lfn, [] ).append( se )
        else:
          # File not available at a diskSE... we shall retry later
          pass

  # Doesn't matter if some files are Offline if they are also online
  for lfn in set( offlineLFNs ) & onlineLFNs:
    offlineLFNs.pop( lfn )

  # If the file was found staged, ignore possible errors, but print out errors
  for se, failedLfns in failed.items():
    logger.error( "Errors when getting files metadata", 'at %s' % se )
    for lfn, reason in failedLfns.items():
      if lfn in onlineLFNs:
        logger.warn( reason, 'for %s, but there is an online replica' % lfn )
        failed[se].pop( lfn )
      else:
        logger.error( reason, 'for %s, no online replicas' % lfn )
        if cmpError( reason, errno.ENOENT ):
          absentLFNs.setdefault( lfn, [] ).append( se )
          failed[se].pop( lfn )
    if not failed[se]:
      failed.pop( se )
  # Find the files that do not exist at SE
  if failed:
    logger.error( "Error getting metadata", "for %d files" % len( set( lfn for lfnList in failed.itervalues() for lfn in lfnList ) ) )

  for lfn in absentLFNs:
    seList = absentLFNs[lfn]
    # FIXME: it is not possible to return here an S_ERROR(), return the message only
    absentLFNs[lfn] = S_ERROR( errno.ENOENT, "File not at %s" % ','.join( seList ) )['Message']
  # Format the error for absent files
  return S_OK()
Example #26
    def _treatOperation(self, operation):
        """Treat one operation:
          * does the callback if the operation is finished
          * generate new jobs and submits them

        :param operation: the operation to treat

        :return: operation, S_OK()/S_ERROR()
        """
        try:
            threadID = current_process().name
            log = gLogger.getLocalSubLogger("treatOperation/%s" % operation.operationID)

            # If the operation is totally processed
            # we perform the callback
            if operation.isTotallyProcessed():
                log.debug("FTS3Operation %s is totally processed" % operation.operationID)
                res = operation.callback()

                if not res["OK"]:
                    log.error("Error performing the callback", res)
                    log.info("Putting back the operation")
                    dbRes = self.fts3db.persistOperation(operation)

                    if not dbRes["OK"]:
                        log.error("Could not persist operation", dbRes)

                    return operation, res

            else:
                log.debug("FTS3Operation %s is not totally processed yet" % operation.operationID)

                # This flag is set to False if we want to stop the ongoing processing
                # of an operation, typically when the matching RMS Request has been
                # canceled (see below)
                continueOperationProcessing = True

                # Check the status of the associated RMS Request.
                # If it is canceled or does not exist anymore, then we will not create new FTS3Jobs,
                # and we mark the FTS3Operation as canceled.

                if operation.rmsReqID:
                    res = ReqClient().getRequestStatus(operation.rmsReqID)
                    if not res["OK"]:
                        # If the Request does not exist anymore
                        if cmpError(res, errno.ENOENT):
                            log.info(
                                "The RMS Request does not exist anymore, canceling the FTS3Operation",
                                "rmsReqID: %s, FTS3OperationID: %s" % (operation.rmsReqID, operation.operationID),
                            )
                            operation.status = "Canceled"
                            continueOperationProcessing = False
                        else:
                            log.error("Could not get request status", res)
                            return operation, res

                    else:
                        rmsReqStatus = res["Value"]

                        if rmsReqStatus == "Canceled":
                            log.info(
                                "The RMS Request is canceled, canceling the FTS3Operation",
                                "rmsReqID: %s, FTS3OperationID: %s" % (operation.rmsReqID, operation.operationID),
                            )
                            operation.status = "Canceled"
                            continueOperationProcessing = False

                if continueOperationProcessing:

                    res = operation.prepareNewJobs(
                        maxFilesPerJob=self.maxFilesPerJob, maxAttemptsPerFile=self.maxAttemptsPerFile
                    )

                    if not res["OK"]:
                        log.error("Cannot prepare new Jobs", "FTS3Operation %s : %s" % (operation.operationID, res))
                        return operation, res

                    newJobs = res["Value"]

                    log.debug("FTS3Operation %s: %s new jobs to be submitted" % (operation.operationID, len(newJobs)))

                    for ftsJob in newJobs:
                        res = self._serverPolicy.chooseFTS3Server()
                        if not res["OK"]:
                            log.error(res)
                            continue

                        ftsServer = res["Value"]
                        log.debug("Use %s server" % ftsServer)

                        ftsJob.ftsServer = ftsServer

                        res = self.getFTS3Context(ftsJob.username, ftsJob.userGroup, ftsServer, threadID=threadID)

                        if not res["OK"]:
                            log.error("Could not get context", res)
                            continue

                        context = res["Value"]

                        try:
                            tpcProtocols = operation.fts3Plugin.selectTPCProtocols(ftsJob=ftsJob)
                        except ValueError as e:
                            log.error("Could not select TPC list", repr(e))
                            continue

                        res = ftsJob.submit(context=context, protocols=tpcProtocols)

                        if not res["OK"]:
                            log.error(
                                "Could not submit FTS3Job", "FTS3Operation %s : %s" % (operation.operationID, res)
                            )
                            continue

                        operation.ftsJobs.append(ftsJob)

                        submittedFileIds = res["Value"]
                        log.info(
                            "FTS3Operation %s: Submitted job for %s transfers"
                            % (operation.operationID, len(submittedFileIds))
                        )

                # new jobs are put in the DB at the same time
            res = self.fts3db.persistOperation(operation)

            if not res["OK"]:
                log.error("Could not persist operation", res)

            return operation, res

        except Exception as e:
            log.exception("Exception in the thread", repr(e))
            return operation, S_ERROR("Exception %s" % repr(e))
Example #27
    def _monitorJob(self, ftsJob):
        """ * query the FTS servers
        * update the FTSFile status
        * update the FTSJob status

        :param ftsJob: FTS job

        :return: ftsJob, S_OK()/S_ERROR()
    """
        # General try/except to keep the thread from dying
        try:
            threadID = current_process().name
            log = gLogger.getSubLogger("_monitorJob/%s" % ftsJob.jobID,
                                       child=True)

            res = self.getFTS3Context(ftsJob.username,
                                      ftsJob.userGroup,
                                      ftsJob.ftsServer,
                                      threadID=threadID)

            if not res['OK']:
                log.error("Error getting context", res)
                return ftsJob, res

            context = res['Value']

            res = ftsJob.monitor(context=context)

            if not res['OK']:
                log.error("Error monitoring job", res)

                # If the job was not found on the server, update the DB
                if cmpError(res, errno.ESRCH):
                    res = self.fts3db.cancelNonExistingJob(
                        ftsJob.operationID, ftsJob.ftsGUID)

                return ftsJob, res

            # { fileID : { Status, Error } }
            filesStatus = res['Value']

            # Specify the job ftsGUID to make sure we do not overwrite
            # status of files already taken by newer jobs
            res = self.fts3db.updateFileStatus(filesStatus,
                                               ftsGUID=ftsJob.ftsGUID)

            if not res['OK']:
                log.error("Error updating file fts status",
                          "%s, %s" % (ftsJob.ftsGUID, res))
                return ftsJob, res

            upDict = {
                ftsJob.jobID: {
                    'status': ftsJob.status,
                    'error': ftsJob.error,
                    'completeness': ftsJob.completeness,
                    'operationID': ftsJob.operationID,
                    'lastMonitor': True,
                }
            }
            res = self.fts3db.updateJobStatus(upDict)

            if ftsJob.status in ftsJob.FINAL_STATES:
                self.__sendAccounting(ftsJob)

            return ftsJob, res

        except Exception as e:
            return ftsJob, S_ERROR(0, "Exception %s" % repr(e))
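The ESRCH branch above maps "job vanished from the FTS server" onto a database cleanup rather than a retry. A compact sketch of that dispatch, where monitor and cancelNonExistingJob stand in for the real calls (both hypothetical here):

import errno

from DIRAC import S_OK
from DIRAC.Core.Utilities.DErrno import cmpError

def monitorOnce(monitor, cancelNonExistingJob, jobID):
    """Hypothetical monitoring step: ESRCH means the job no longer
    exists server-side, so clean up instead of retrying."""
    res = monitor(jobID)
    if not res["OK"]:
        if cmpError(res, errno.ESRCH):
            return cancelNonExistingJob(jobID)
        return res
    return S_OK(res["Value"])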
Example #28
    def execute(self):
        ''' Main execution method
    '''

        gMonitor.addMark('Iteration', 1)
        # Get all the transformations
        result = self.transClient.getTransformations({
            'Status':
            'Active',
            'Type':
            self.transformationTypes
        })
        if not result['OK']:
            self.log.error(
                "InputDataAgent.execute: Failed to get transformations.",
                result['Message'])
            return S_OK()

        # Process each transformation
        for transDict in result['Value']:
            transID = long(transDict['TransformationID'])
            # res = self.transClient.getTransformationInputDataQuery( transID )
            res = self.transClient.getTransformationMetaQuery(transID, 'Input')
            if not res['OK']:
                if cmpError(res, ENOENT):
                    self.log.info(
                        "InputDataAgent.execute: No input data query found for transformation",
                        transID)
                else:
                    self.log.error(
                        "InputDataAgent.execute: Failed to get input data query",
                        "for %d: %s" % (transID, res['Message']))
                continue
            inputDataQuery = res['Value']

            if self.refreshonly:
                # Determine the correct time stamp to use for this transformation
                if transID in self.timeLog:
                    if transID in self.fullTimeLog:
                        # If it is more than a day since the last reduced query, make a full query just in case
                        if (datetime.datetime.utcnow() -
                                self.fullTimeLog[transID]
                            ) < datetime.timedelta(
                                seconds=self.fullUpdatePeriod):
                            timeStamp = self.timeLog[transID]
                            if self.dateKey:
                                inputDataQuery[self.dateKey] = (
                                    timeStamp - datetime.timedelta(seconds=10)
                                ).strftime('%Y-%m-%d %H:%M:%S')
                            else:
                                self.log.error(
                                    "DateKey was not set in the CS, cannot use the RefreshOnly"
                                )
                        else:
                            self.fullTimeLog[
                                transID] = datetime.datetime.utcnow()
                self.timeLog[transID] = datetime.datetime.utcnow()
                if transID not in self.fullTimeLog:
                    self.fullTimeLog[transID] = datetime.datetime.utcnow()

            # Perform the query to the metadata catalog
            self.log.verbose("Using input data query for transformation",
                             "%d: %s" % (transID, str(inputDataQuery)))
            start = time.time()
            result = self.metadataClient.findFilesByMetadata(inputDataQuery)
            rtime = time.time() - start
            self.log.verbose("Metadata catalog query time",
                             ": %.2f seconds." % (rtime))
            if not result['OK']:
                self.log.error(
                    "InputDataAgent.execute: Failed to get response from the metadata catalog",
                    result['Message'])
                continue
            lfnList = result['Value']

            # Check if the number of files has changed since the last cycle
            nlfns = len(lfnList)
            self.log.info(
                "files returned for transformation from the metadata catalog: ",
                "%d -> %d" % (int(transID), nlfns))
            if nlfns == self.fileLog.get(transID):
                self.log.verbose(
                    'No new files in metadata catalog since last check')
            self.fileLog[transID] = nlfns

            # Add any new files to the transformation
            addedLfns = []
            if lfnList:
                self.log.verbose('Processing lfns for transformation:',
                                 "%d -> %d" % (transID, len(lfnList)))
                # Add the files to the transformation
                self.log.verbose('Adding lfns for transformation:',
                                 "%d -> %d" % (transID, len(lfnList)))
                result = self.transClient.addFilesToTransformation(
                    transID, sorted(lfnList))
                if not result['OK']:
                    self.log.warn(
                        "InputDataAgent.execute: failed to add lfns to transformation",
                        result['Message'])
                    self.fileLog[transID] = 0
                else:
                    if result['Value']['Failed']:
                        for lfn, error in result['Value']['Failed'].items():
                            self.log.warn(
                                "InputDataAgent.execute: Failed to add to transformation:",
                                "%s: %s" % (lfn, error))
                    if result['Value']['Successful']:
                        for lfn, status in result['Value']['Successful'].items(
                        ):
                            if status == 'Added':
                                addedLfns.append(lfn)
                        self.log.info(
                            "InputDataAgent.execute: Added files to transformation",
                            "(%d)" % len(addedLfns))

        return S_OK()
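
A note on the result handling above: DIRAC bulk calls such as addFilesToTransformation return their payload split into 'Successful' and 'Failed' dictionaries keyed by LFN. A minimal sketch of consuming such a result (the LFNs and statuses are made up):

# Hypothetical result following DIRAC's Successful/Failed convention
result = {'OK': True,
          'Value': {'Successful': {'/vo/data/f1': 'Added'},
                    'Failed': {'/vo/data/f2': 'Permission denied'}}}

# Collect the files that were actually added
addedLfns = [lfn for lfn, status in result['Value']['Successful'].items()
             if status == 'Added']
# Report the ones that were not
for lfn, error in result['Value']['Failed'].items():
    print('%s could not be added: %s' % (lfn, error))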
Exemple #29
0
    def transferAndRegisterFile(self,
                                fileName,
                                localPath,
                                lfn,
                                destinationSEList,
                                fileMetaDict,
                                fileCatalog=None,
                                masterCatalogOnly=False):
        """Performs the transfer and register operation with failover.
    """
        errorList = []
        fileGUID = fileMetaDict.get("GUID", None)
        fileChecksum = fileMetaDict.get("Checksum", None)

        for se in destinationSEList:

            # We put here some retry in case the problem comes from the FileCatalog
            # being unavailable. If it is, then the `hasAccess` call would fail,
            # and we would not make any failover request. So the only way is to wait a bit
            # This keeps the WN busy for a while, but at least we do not lose all the processing
            # time we just spent
            for sleeptime in (10, 60, 300, 600):
                self.log.info(
                    "Attempting dm.putAndRegister",
                    "('%s','%s','%s',guid='%s',catalog='%s', checksum = '%s')"
                    %
                    (lfn, localPath, se, fileGUID, fileCatalog, fileChecksum))

                result = DataManager(
                    catalogs=fileCatalog,
                    masterCatalogOnly=masterCatalogOnly).putAndRegister(
                        lfn,
                        localPath,
                        se,
                        guid=fileGUID,
                        checksum=fileChecksum)
                self.log.verbose(result)

                # If the FC is unavailable, we stay in the loop and retry
                # otherwise we continue without retrying
                if result['OK'] or not cmpError(result, EFCERR):
                    break
                self.log.error(
                    "transferAndRegisterFile: FC unavailable, retry")
                time.sleep(sleeptime)

            if not result['OK']:
                self.log.error('dm.putAndRegister failed with message',
                               result['Message'])
                errorList.append(result['Message'])
                continue

            if not result['Value']['Failed']:
                self.log.info(
                    'dm.putAndRegister successfully uploaded and registered',
                    '%s to %s' % (fileName, se))
                return S_OK({'uploadedSE': se, 'lfn': lfn})

            # Now we know something went wrong
            self.log.warn(
                "Didn't manage to do everything, now adding requests for the missing operation"
            )

            errorDict = result['Value']['Failed'][lfn]
            if 'register' not in errorDict:
                self.log.error('dm.putAndRegister failed with unknown error',
                               str(errorDict))
                errorList.append(
                    'Unknown error while attempting upload to %s' % se)
                continue

            # The registration failed but the upload was successful:
            # set a registration request so it can be redone later
            if not fileCatalog:
                fileCatalog = ''

            if masterCatalogOnly:
                fileCatalog = FileCatalog().getMasterCatalogNames()['Value']

            result = self._setRegistrationRequest(lfn, se, fileMetaDict,
                                                  fileCatalog)
            if not result['OK']:
                self.log.error('Failed to set registration request',
                               'SE %s and metadata: \n%s' % (se, fileMetaDict))
                errorList.append(
                    'Failed to set registration request for: SE %s and metadata: \n%s'
                    % (se, fileMetaDict))
                continue
            else:
                self.log.info(
                    'Successfully set registration request',
                    'for: SE %s and metadata: \n%s' % (se, fileMetaDict))
                metadata = {}
                metadata['filedict'] = fileMetaDict
                metadata['uploadedSE'] = se
                metadata['lfn'] = lfn
                metadata['registration'] = 'request'
                return S_OK(metadata)

        self.log.error('Failed to upload output data file',
                       'Encountered %s errors' % len(errorList))
        return S_ERROR('Failed to upload output data file')
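
The backoff loop above retries only when cmpError identifies the failure as a FileCatalog outage (EFCERR); a success, or any other error, leaves the loop and the next SE is tried if needed. A stripped-down sketch of the same pattern, where attemptUpload is a placeholder for the dm.putAndRegister call:

import time

from DIRAC import S_OK
from DIRAC.Core.Utilities.DErrno import EFCERR, cmpError

def attemptUpload():
    # placeholder for DataManager(...).putAndRegister(...)
    return S_OK()

for sleeptime in (10, 60, 300, 600):
    result = attemptUpload()
    if result['OK'] or not cmpError(result, EFCERR):
        break  # success, or an error that retrying will not cure
    time.sleep(sleeptime)  # FileCatalog unavailable: back off and retry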
Exemple #30
0
def _checkFilesToStage(seToLFNs, onlineLFNs, offlineLFNs, absentLFNs):
    """
  Checks on SEs whether the file is NEARLINE or ONLINE
  onlineLFNs is modified to contain the files found online
  """
    # Only check on storage if it is a tape SE
    failed = {}
    for se, lfnsInSEList in seToLFNs.iteritems():
        seObj = StorageElement(se)
        status = seObj.getStatus()
        if not status['OK']:
            gLogger.error("Could not get SE status",
                          "%s - %s" % (se, status['Message']))
            return status
        tapeSE = status['Value']['TapeSE']
        # Check the file metadata at this SE to see which replicas are online
        fileMetadata = seObj.getFileMetadata(lfnsInSEList)
        if not fileMetadata['OK']:
            failed[se] = dict.fromkeys(lfnsInSEList, fileMetadata['Message'])
        else:
            if fileMetadata['Value']['Failed']:
                failed[se] = fileMetadata['Value']['Failed']
            # is there at least one replica online?
            for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
                # SRM returns Cached, but others may only return Accessible
                if mDict.get('Cached', mDict['Accessible']):
                    onlineLFNs.add(lfn)
                elif tapeSE:
                    # A file can be staged only at Tape SE
                    offlineLFNs.setdefault(lfn, []).append(se)
                else:
                    # File not available at a diskSE... we shall retry later
                    pass

    # Doesn't matter if some files are Offline if they are also online
    for lfn in set(offlineLFNs) & onlineLFNs:
        offlineLFNs.pop(lfn)

    # If the file was found staged somewhere, ignore possible errors, but still print them out
    for se, failedLfns in failed.items():
        gLogger.error("Errors when getting files metadata", 'at %s' % se)
        for lfn, reason in failedLfns.items():
            if lfn in onlineLFNs:
                gLogger.info('%s: %s, but there is an online replica' %
                             (lfn, reason))
                failed[se].pop(lfn)
            else:
                gLogger.error('%s: %s, no online replicas' % (lfn, reason))
                if cmpError(reason, errno.ENOENT):
                    absentLFNs.setdefault(lfn, []).append(se)
                    failed[se].pop(lfn)
        if not failed[se]:
            failed.pop(se)
    # Find the files that do not exist at SE
    if failed:
        gLogger.error(
            "Error getting metadata", "for %d files" %
            len(set(lfn for lfnList in failed.itervalues()
                    for lfn in lfnList)))

    return S_OK()
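
Since one online replica is enough, entries present in both maps are removed from offlineLFNs before the errors are examined. With hypothetical data:

# onlineLFNs is a set of LFNs, offlineLFNs maps LFN -> list of tape SEs
onlineLFNs = {'/vo/f1'}
offlineLFNs = {'/vo/f1': ['SOME-TAPE-SE'], '/vo/f2': ['SOME-TAPE-SE']}

for lfn in set(offlineLFNs) & onlineLFNs:
    offlineLFNs.pop(lfn)

# Only the genuinely offline file remains:
assert offlineLFNs == {'/vo/f2': ['SOME-TAPE-SE']}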
Exemple #31
0
    def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):

        log = self._log.getSubLogger("_prepareNewJobs", child=True)

        filesToSubmit = self._getFilesToSubmit(
            maxAttemptsPerFile=maxAttemptsPerFile)
        log.debug("%s ftsFiles to submit" % len(filesToSubmit))

        newJobs = []

        # {targetSE : [FTS3Files] }
        res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
        if not res['OK']:
            return res
        filesGroupedByTarget = res['Value']

        for targetSE, ftsFiles in filesGroupedByTarget.items():

            res = self._checkSEAccess(targetSE, 'WriteAccess', vo=self.vo)

            if not res['OK']:
                # If the SE is currently banned, we just skip it
                if cmpError(res, errno.EACCES):
                    log.info(
                        "Write access currently not permitted to %s, skipping."
                        % targetSE)
                else:
                    log.error(res)
                    for ftsFile in ftsFiles:
                        ftsFile.attempt += 1
                continue

            sourceSEs = self.sourceSEs.split(
                ',') if self.sourceSEs is not None else []
            # { sourceSE : [FTSFiles] }
            res = FTS3Utilities.selectUniqueRandomSource(
                ftsFiles, allowedSources=sourceSEs)

            if not res['OK']:
                return res

            uniqueTransfersBySource, failedFiles = res['Value']

            # Treat the errors of the failed files
            for ftsFile, errMsg in failedFiles.items():
                log.error("Error when selecting random sources",
                          "%s, %s" % (ftsFile.lfn, errMsg))
                # If the error is that the file does not exist in the catalog
                # fail it !
                if cmpError(errMsg, errno.ENOENT):
                    log.error("The file does not exist, setting it Defunct",
                              "%s" % ftsFile.lfn)
                    ftsFile.status = 'Defunct'

            # We don't need to check the source, since it is already filtered by the DataManager
            for sourceSE, ftsFiles in uniqueTransfersBySource.items():

                if self.__needsMultiHopStaging(sourceSE, targetSE):
                    log.verbose(
                        "Needs multihop staging, max files per job is 1")
                    maxFilesPerJob = 1

                for ftsFilesChunk in breakListIntoChunks(
                        ftsFiles, maxFilesPerJob):

                    newJob = self._createNewJob('Transfer',
                                                ftsFilesChunk,
                                                targetSE,
                                                sourceSE=sourceSE)

                    newJobs.append(newJob)

        return S_OK(newJobs)
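
The chunking at the end uses DIRAC's breakListIntoChunks helper so that no FTS3 job carries more than maxFilesPerJob files (and exactly one when multi-hop staging is required). Roughly:

from DIRAC.Core.Utilities.List import breakListIntoChunks

ftsFiles = list(range(7))  # stand-ins for FTS3File objects
for chunk in breakListIntoChunks(ftsFiles, 3):
    print(chunk)
# prints [0, 1, 2], then [3, 4, 5], then [6]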
Exemple #32
0
    def transferAndRegisterFile(
        self,
        fileName,
        localPath,
        lfn,
        destinationSEList,
        fileMetaDict,
        fileCatalog=None,
        masterCatalogOnly=False,
        retryUpload=False,
    ):
        """Performs the transfer and register operation with failover.

        :param fileName: not used, except for printing in the logs.
        :param localPath: path to the file locally
        :param lfn: LFN
        :param destinationSEList: list of possible destination for the file.
          Loop over it until one succeeds or we reach the end of it.
        :param fileMetaDict: file metadata for registration
        :param fileCatalog: list of catalogs to use (see :py:class:`DIRAC.DataManagementSystem.Client.DataManager`)
        :param masterCatalogOnly: use only master catalog (see :py:class:`DIRAC.DataManagementSystem.Client.DataManager`)
        :param retryUpload: if set to True, and there is only one output SE in destinationSEList, retry several times.

        """
        errorList = []
        fileGUID = fileMetaDict.get("GUID", None)
        fileChecksum = fileMetaDict.get("Checksum", None)

        for se in destinationSEList:

            # We put here some retry in case the problem comes from the FileCatalog
            # being unavailable. If it is, then the `hasAccess` call would fail,
            # and we would not make any failover request. So the only way is to wait a bit
            # This keeps the WN busy for a while, but at least we do not lose all the processing
            # time we just spent
            # This same retry path is taken if we only have one possible stage out SE
            # and retryUpload is True
            for sleeptime in (10, 60, 300, 600):
                self.log.info(
                    "Attempting dm.putAndRegister",
                    "('%s','%s','%s',guid='%s',catalog='%s', checksum = '%s')"
                    %
                    (lfn, localPath, se, fileGUID, fileCatalog, fileChecksum),
                )

                result = DataManager(
                    catalogs=fileCatalog,
                    masterCatalogOnly=masterCatalogOnly).putAndRegister(
                        lfn,
                        localPath,
                        se,
                        guid=fileGUID,
                        checksum=fileChecksum)
                # retry on any failure
                if result["OK"]:
                    self.log.verbose(result)
                    break
                elif cmpError(result, EFCERR):
                    self.log.debug(
                        "transferAndRegisterFile: FC unavailable, retry")
                elif retryUpload and len(destinationSEList) == 1:
                    self.log.debug(
                        "transferAndRegisterFile: Failed uploading to the only SE, retry"
                    )
                else:
                    self.log.debug(
                        "dm.putAndRegister failed, but move to the next")
                    break
                time.sleep(sleeptime)

            if not result["OK"]:
                self.log.error("dm.putAndRegister failed with message",
                               result["Message"])
                errorList.append(result["Message"])
                continue

            if not result["Value"]["Failed"]:
                self.log.info(
                    "dm.putAndRegister successfully uploaded and registered",
                    "%s to %s" % (fileName, se))
                return S_OK({"uploadedSE": se, "lfn": lfn})

            # Now we know something went wrong
            self.log.warn(
                "Didn't manage to do everything, now adding requests for the missing operation"
            )

            errorDict = result["Value"]["Failed"][lfn]
            if "register" not in errorDict:
                self.log.error("dm.putAndRegister failed with unknown error",
                               str(errorDict))
                errorList.append(
                    "Unknown error while attempting upload to %s" % se)
                continue

            # The registration failed but the upload was successful:
            # set a registration request so it can be redone later
            if not fileCatalog:
                fileCatalog = ""

            if masterCatalogOnly:
                fileCatalog = FileCatalog().getMasterCatalogNames()["Value"]

            result = self._setRegistrationRequest(lfn, se, fileMetaDict,
                                                  fileCatalog)
            if not result["OK"]:
                self.log.error("Failed to set registration request",
                               "SE %s and metadata: \n%s" % (se, fileMetaDict))
                errorList.append(
                    "Failed to set registration request for: SE %s and metadata: \n%s"
                    % (se, fileMetaDict))
                continue
            else:
                self.log.info(
                    "Successfully set registration request",
                    "for: SE %s and metadata: \n%s" % (se, fileMetaDict))
                metadata = {}
                metadata["filedict"] = fileMetaDict
                metadata["uploadedSE"] = se
                metadata["lfn"] = lfn
                metadata["registration"] = "request"
                return S_OK(metadata)

        self.log.error("Failed to upload output data file",
                       "Encountered %s errors" % len(errorList))
        return S_ERROR("Failed to upload output data file")
Exemple #33
0
def getFilesToStage( lfnList, jobState = None, checkOnlyTapeSEs = None, jobLog = None ):
  """ Utility that returns out of a list of LFNs those files that are offline,
      and those for which at least one copy is online
  """
  if not lfnList:
    return S_OK( {'onlineLFNs':[], 'offlineLFNs': {}, 'failedLFNs':[], 'absentLFNs':{}} )

  dm = DataManager()
  if isinstance( lfnList, basestring ):
    lfnList = [lfnList]

  lfnListReplicas = dm.getReplicasForJobs( lfnList, getUrl = False )
  if not lfnListReplicas['OK']:
    return lfnListReplicas

  offlineLFNsDict = {}
  onlineLFNs = {}
  offlineLFNs = {}
  absentLFNs = {}
  failedLFNs = set()
  # Initialised here so that the final return does not raise a NameError
  # when no replicas are found at all
  onlineSites = set()
  if lfnListReplicas['Value']['Failed']:
    # Check if files are not existing
    for lfn, reason in lfnListReplicas['Value']['Failed'].iteritems():
      # FIXME: awful check until FC returns a proper error
      if cmpError( reason, errno.ENOENT ) or 'No such file' in reason:
        # The file doesn't exist, job must be Failed
        # FIXME: it is not possible to return here an S_ERROR(), return the message only
        absentLFNs[lfn] = S_ERROR( errno.ENOENT, 'File not in FC' )['Message']
    if absentLFNs:
      return S_OK({'onlineLFNs': list(onlineLFNs),
                   'offlineLFNs': offlineLFNsDict,
                   'failedLFNs': list(failedLFNs),
                   'absentLFNs': absentLFNs})
    return S_ERROR( "Failures in getting replicas" )

  lfnListReplicas = lfnListReplicas['Value']['Successful']
  # If a file is reported here at a tape SE, it is not at a disk SE as we use disk in priority
  # We shall check all files anyway in order to make sure they exist
  seToLFNs = dict()
  for lfn, ses in lfnListReplicas.iteritems():
    for se in ses:
      seToLFNs.setdefault( se, list() ).append( lfn )

  if seToLFNs:
    if jobState:
      # Get user name and group from the job state
      userName = jobState.getAttribute( 'Owner' )
      if not userName[ 'OK' ]:
        return userName
      userName = userName['Value']

      userGroup = jobState.getAttribute( 'OwnerGroup' )
      if not userGroup[ 'OK' ]:
        return userGroup
      userGroup = userGroup['Value']
    else:
      userName = None
      userGroup = None
    # Check whether files are Online or Offline, or missing at SE
    result = _checkFilesToStage( seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,  # pylint: disable=unexpected-keyword-arg
                                 checkOnlyTapeSEs = checkOnlyTapeSEs, jobLog = jobLog,
                                 proxyUserName = userName,
                                 proxyUserGroup = userGroup,
                                 executionLock = True )

    if not result['OK']:
      return result
    failedLFNs = set( lfnList ) - set( onlineLFNs ) - set( offlineLFNs ) - set( absentLFNs )

    # Get the online SEs
    dmsHelper = DMSHelpers()
    onlineSEs = set( se for ses in onlineLFNs.values() for se in ses )
    onlineSites = set( dmsHelper.getLocalSiteForSE( se ).get( 'Value' ) for se in onlineSEs ) - {None}
    for lfn in offlineLFNs:
      ses = offlineLFNs[lfn]
      if len( ses ) == 1:
        # No choice, let's go
        offlineLFNsDict.setdefault( ses[0], list() ).append( lfn )
        continue
      # Try and get an SE at a site already with online files
      found = False
      if onlineSites:
        # If there is at least one online site, select one
        for se in ses:
          site = dmsHelper.getLocalSiteForSE( se )
          if site['OK']:
            if site['Value'] in onlineSites:
              offlineLFNsDict.setdefault( se, list() ).append( lfn )
              found = True
              break
      # No online site found in common, select randomly
      if not found:
        offlineLFNsDict.setdefault( random.choice( ses ), list() ).append( lfn )

  return S_OK({'onlineLFNs': list(onlineLFNs),
               'offlineLFNs': offlineLFNsDict,
               'failedLFNs': list(failedLFNs),
               'absentLFNs': absentLFNs,
               'onlineSites': onlineSites})
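
getFilesToStage returns up to five buckets. A hedged sketch of how a caller might act on them; runJob and submitStagingRequest are placeholders, as are the LFNs:

def runJob(lfns):
    pass  # placeholder: start the payload with the online files

def submitStagingRequest(se, lfns):
    pass  # placeholder: ask the SE to bring these files online

res = getFilesToStage(['/vo/f1', '/vo/f2'])
if res['OK']:
    info = res['Value']
    runJob(info['onlineLFNs'])
    for se, lfnList in info['offlineLFNs'].items():
        submitStagingRequest(se, lfnList)
    for lfn, message in info['absentLFNs'].items():
        print('Cannot use %s: %s' % (lfn, message))  # absent files should fail the job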
Exemple #34
0
  def execute(self):
    """ read requests from RequestClient and enqueue them into ProcessPool """
    gMonitor.addMark("Iteration", 1)
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
      self.log.debug("execute: executing %d request in this cycle" % taskCounter)

      requestsToExecute = []

      if not self.__bulkRequest:
        self.log.info("execute: ask for a single request")
        getRequest = self.requestClient().getRequest()
        if not getRequest["OK"]:
          self.log.error("execute:", "%s" % getRequest["Message"])
          break
        if not getRequest["Value"]:
          self.log.info("execute: no more 'Waiting' requests to process")
          break
        requestsToExecute = [getRequest["Value"]]
      else:
        numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
        self.log.info("execute: ask for requests", "%s" % numberOfRequest)
        getRequests = self.requestClient().getBulkRequests(numberOfRequest)
        if not getRequests["OK"]:
          self.log.error("execute:", "%s" % getRequests["Message"])
          break
        if not getRequests["Value"]:
          self.log.info("execute: no more 'Waiting' requests to process")
          break
        for rId in getRequests["Value"]["Failed"]:
          self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])

        requestsToExecute = getRequests["Value"]["Successful"].values()

      self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))

      for request in requestsToExecute:
        # # set task id
        taskID = request.RequestID

        self.log.info("processPool status", "tasks idle = %s working = %s" %
                      (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()))

        looping = 0
        while True:
          if not self.processPool().getFreeSlots():
            if not looping:
              self.log.info(
                  "No free slots available in processPool",
                  "will wait %d seconds to proceed" %
                  self.__poolSleep)
            time.sleep(self.__poolSleep)
            looping += 1
          else:
            if looping:
              self.log.info("Free slot found", "after %d seconds" % looping * self.__poolSleep)
            looping = 0
            # # save current request in cache
            res = self.cacheRequest(request)
            if not res['OK']:
              if cmpError(res, errno.EALREADY):
                # The request is already in the cache, skip it. break out of the while loop to get next request
                break
              # There are too many requests in the cache, commit suicide
              self.log.error(
                  "Too many requests in cache",
                  '(%d requests): put back all requests and exit cycle. Error %s' %
                  (len(self.__requestCache), res['Message']))
              self.putAllRequests()
              return res
            # # serialize to JSON
            result = request.toJSON()
            if not result['OK']:
              continue
            requestJSON = result['Value']
            self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
            timeOut = self.getTimeout(request)
            enqueue = self.processPool().createAndQueueTask(RequestTask,
                                                            kwargs={"requestJSON": requestJSON,
                                                                    "handlersDict": self.handlersDict,
                                                                    "csPath": self.__configPath,
                                                                    "agentName": self.agentName},
                                                            taskID=taskID,
                                                            blocking=True,
                                                            usePoolCallbacks=True,
                                                            timeOut=timeOut)
            if not enqueue["OK"]:
              self.log.error("Could not enqueue task", enqueue["Message"])
            else:
              self.log.debug("successfully enqueued task", "'%s'" % taskID)
              # # update monitor
              gMonitor.addMark("Processed", 1)
              # # update request counter
              taskCounter += 1
              # # task created, a little time kick to proceed
              time.sleep(0.1)
              break

    self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
    processed = self.processPool().processResults()
    # This happens when the result queue is screwed up.
    # Returning S_ERROR proved not to be sufficient,
    # and when in this situation, there is nothing we can do.
    # So we just exit. runit will restart from scratch.
    if processed < 0:
      self.log.fatal("Results queue is screwed up")
      sys.exit(1)
    # # clean return
    return S_OK()
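
The cacheRequest handling above distinguishes 'request already cached' from 'cache full' purely by the errno carried in the returned structure. A minimal sketch, assuming cacheRequest follows that convention:

import errno

from DIRAC import S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError

res = S_ERROR(errno.EALREADY, 'Request already in cache')  # e.g. what cacheRequest may return
if not res['OK']:
    if cmpError(res, errno.EALREADY):
        pass  # duplicate request: skip it and fetch the next one
    else:
        pass  # cache overflow or other error: put all requests back and exit the cycle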
Exemple #35
0
def _checkFilesToStage( seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,
                        checkOnlyTapeSEs = None, jobLog = None,
                        proxyUserName = None,
                        proxyUserGroup = None,
                        executionLock = None ):
  """
  Checks on SEs whether the file is NEARLINE or ONLINE
  onlineLFNs, offlineLFNs and absentLFNs are modified in place to record the files found online, offline or absent, respectively
  If checkOnlyTapeSEs is True, disk replicas are not checked
  As soon as a replica is found Online for a file, no further check is made
  """
  # Only check on storage if it is a tape SE
  if jobLog is None:
    logger = gLogger
  else:
    logger = jobLog
  if checkOnlyTapeSEs is None:
    # Default value is True
    checkOnlyTapeSEs = True

  failed = {}
  for se, lfnsInSEList in seToLFNs.iteritems():
    # If we have already found all files online at another SE, there is no need to check the others,
    # but we still want to mark this SE as online if it is not a tape SE
    vo = getVOForGroup( proxyUserGroup )
    seObj = StorageElement( se, vo = vo )
    status = seObj.getStatus()
    if not status['OK']:
      return status
    tapeSE = status['Value']['TapeSE']
    diskSE = status['Value']['DiskSE']
    # If requested to check only Tape SEs and the file is at a diskSE, we guess it is Online...
    filesToCheck = []
    for lfn in lfnsInSEList:
      # If the file had already been found accessible at an SE, only check that this one is on disk
      diskIsOK = checkOnlyTapeSEs or ( lfn in onlineLFNs )
      if diskIsOK and diskSE:
        onlineLFNs.setdefault( lfn, [] ).append( se )
      elif not diskIsOK or ( tapeSE and ( lfn not in onlineLFNs ) ):
        filesToCheck.append( lfn )
    if not filesToCheck:
      continue

    # We have to use a new SE object because it caches the proxy!
    with UserProxy(proxyUserName=proxyUserName,
                   proxyUserGroup=proxyUserGroup,
                   executionLock=executionLock) as proxyResult:
      if proxyResult['OK']:
        fileMetadata = StorageElement(se, vo=vo).getFileMetadata(filesToCheck)
      else:
        fileMetadata = proxyResult

    if not fileMetadata['OK']:
      failed[se] = dict.fromkeys( filesToCheck, fileMetadata['Message'] )
    else:
      if fileMetadata['Value']['Failed']:
        failed[se] = fileMetadata['Value']['Failed']
      # is there at least one replica online?
      for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
        # SRM returns Cached, but others may only return Accessible
        if mDict.get( 'Cached', mDict['Accessible'] ):
          onlineLFNs.setdefault( lfn, [] ).append( se )
        elif tapeSE:
          # A file can be staged only at Tape SE
          offlineLFNs.setdefault( lfn, [] ).append( se )
        else:
          # File not available at a diskSE... we shall retry later
          pass

  # Doesn't matter if some files are Offline if they are also online
  for lfn in set( offlineLFNs ) & set( onlineLFNs ):
    offlineLFNs.pop( lfn )

  # If the file was found staged somewhere, ignore possible errors, but still print them out
  for se, failedLfns in failed.items():
    logger.error( "Errors when getting files metadata", 'at %s' % se )
    for lfn, reason in failedLfns.items():
      if lfn in onlineLFNs:
        logger.warn( reason, 'for %s, but there is an online replica' % lfn )
        failed[se].pop( lfn )
      else:
        logger.error( reason, 'for %s, no online replicas' % lfn )
        if cmpError( reason, errno.ENOENT ):
          absentLFNs.setdefault( lfn, [] ).append( se )
          failed[se].pop( lfn )
    if not failed[se]:
      failed.pop( se )
  # Find the files that do not exist at SE
  if failed:
    logger.error( "Error getting metadata", "for %d files" % len( set( lfn for lfnList in failed.itervalues() for lfn in lfnList ) ) )

  # Format the error message for absent files
  for lfn in absentLFNs:
    seList = absentLFNs[lfn]
    # FIXME: it is not possible to return here an S_ERROR(), return the message only
    absentLFNs[lfn] = S_ERROR( errno.ENOENT, "File not at %s" % ','.join( seList ) )['Message']
  return S_OK()
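
In this version the metadata lookup runs under the job owner's credentials through the UserProxy context manager, so that access control at the SE is evaluated for the right identity. A sketch of the pattern, assuming UserProxy lives in DIRAC.Core.Utilities.Proxy (the user name and group are made up):

from DIRAC.Core.Utilities.Proxy import UserProxy

with UserProxy(proxyUserName='auser', proxyUserGroup='a_group') as proxyResult:
    if proxyResult['OK']:
        pass  # calls made here run with the user's proxy in the environment
    else:
        pass  # proxy could not be obtained: propagate proxyResult as the error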