def test_getFilesToStage(self, _patch, _patched):
        """Simple case: the StorageElement mock reports all the files as online.

        An empty request must give empty results; a single-LFN request is
        expected to come back with both mocked LFNs online and nothing offline.
        """
        emptyReply = getFilesToStage([])
        self.assertTrue(emptyReply['OK'])
        emptyValue = emptyReply['Value']
        self.assertEqual(emptyValue['onlineLFNs'], [])
        self.assertEqual(emptyValue['offlineLFNs'], {})

        lfnReply = getFilesToStage(['/a/lfn/1.txt'])
        self.assertTrue(lfnReply['OK'])
        lfnValue = lfnReply['Value']
        expectedOnline = ['/a/lfn/1.txt', '/a/lfn/2.txt']
        self.assertEqual(lfnValue['onlineLFNs'], expectedOnline)
        self.assertEqual(lfnValue['offlineLFNs'], {})
Example #2
0
    def test_getFilesToStage(self):
        """Check getFilesToStage for an empty request and for one LFN.

        The empty request is issued before the StorageManagerClient module is
        re-pointed at the test's DataManager/StorageElement mocks; the
        single-LFN request is issued afterwards.
        """
        res = getFilesToStage([])
        self.assertTrue(res["OK"])
        self.assertEqual(res["Value"]["onlineLFNs"], [])
        self.assertEqual(res["Value"]["offlineLFNs"], {})

        # Replace the collaborators looked up by the client module with the mocks.
        ourSMC = importlib.import_module("DIRAC.StorageManagementSystem.Client.StorageManagerClient")
        ourSMC.DataManager = self.mockDM
        ourSMC.StorageElement = self.mockSE

        res = getFilesToStage(["/a/lfn/1.txt"])
        self.assertTrue(res["OK"])
        self.assertEqual(res["Value"]["onlineLFNs"], ["/a/lfn/2.txt"])
        # The file may be assigned to either SE, so accept both outcomes.
        # (The previous assert_(value, a or b) only checked that the value was
        # truthy: the second argument was used as the failure message.)
        self.assertIn(res["Value"]["offlineLFNs"], [{"SE1": ["/a/lfn/1.txt"]}, {"SE2": ["/a/lfn/1.txt"]}])
  def test_getFilesToStage( self ):
    """ Check getFilesToStage for an empty request and for one LFN.

        The empty request is issued before the StorageManagerClient module is
        re-pointed at the test's DataManager/StorageElement mocks; the
        single-LFN request is issued afterwards.
    """
    res = getFilesToStage( [] )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value']['onlineLFNs'], [] )
    self.assertEqual( res['Value']['offlineLFNs'], {} )

    # Replace the collaborators looked up by the client module with the mocks.
    ourSMC = importlib.import_module( 'DIRAC.StorageManagementSystem.Client.StorageManagerClient' )
    ourSMC.DataManager = self.mockDM
    ourSMC.StorageElement = self.mockSE


    res = getFilesToStage( ['/a/lfn/1.txt'] )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value']['onlineLFNs'], ['/a/lfn/2.txt'] )
    # The file may be assigned to either SE, so accept both outcomes.
    # (The previous assert_( value, a or b ) only checked that the value was
    # truthy: the second argument was used as the failure message.)
    self.assertIn( res['Value']['offlineLFNs'], [{'SE1':['/a/lfn/1.txt']}, {'SE2':['/a/lfn/1.txt']}] )
 def test_getFilesToStage_tapeSEOnly_1(self, _patch, _patched):
     """Tape-SE-only check where the StorageElement reports the file as available."""
     outcome = getFilesToStage(["/a/lfn/2.txt"], checkOnlyTapeSEs=True)
     self.assertTrue(outcome["OK"])
     value = outcome["Value"]
     # The file is expected online; every other category must stay empty.
     expectations = (
         ("onlineLFNs", ["/a/lfn/2.txt"]),
         ("offlineLFNs", {}),
         ("absentLFNs", {}),
         ("failedLFNs", []),
     )
     for key, expected in expectations:
         self.assertEqual(value[key], expected)
 def test_getFilesToStage_fileInaccessibleAtDisk(self, _patch, _patched):
     """A file the StorageElement reports as unavailable at a Disk SE must end up in failedLFNs."""
     lfn = "/a/lfn/1.txt"
     reply = getFilesToStage([lfn], checkOnlyTapeSEs=False)
     self.assertTrue(reply["OK"])
     staging = reply["Value"]
     self.assertEqual(staging["onlineLFNs"], [])
     self.assertEqual(staging["offlineLFNs"], {})
     self.assertEqual(staging["absentLFNs"], {})
     self.assertEqual(staging["failedLFNs"], [lfn])
 def test_getFilesToStage_withFilesToStage(self, _patch, _patched):
     """ Test where the StorageElement mock will return files offline.

     The offline file may be attributed to either SE1 or SE2, so both
     outcomes are accepted.
     """
     res = getFilesToStage(['/a/lfn/1.txt'])
     self.assertTrue(res['OK'])
     self.assertEqual(res['Value']['onlineLFNs'], ['/a/lfn/2.txt'])
     # assertIn compares the whole dict against each accepted outcome.
     # (The previous assert_(value, a or b) only checked that the value was
     # truthy: the second argument was used as the failure message.)
     self.assertIn(res['Value']['offlineLFNs'], [{'SE1': ['/a/lfn/1.txt']},
                                                 {'SE2': ['/a/lfn/1.txt']}])
 def test_getFilesToStage_noSuchFile( self, _patch, _patched ):
   """ Case where the StorageElement reports the requested file as absent:
       the LFN must show up in absentLFNs with the error text, nowhere else.
   """
   lfn = '/a/lfn/2.txt'
   reply = getFilesToStage( [lfn], checkOnlyTapeSEs = False )
   self.assertTrue( reply['OK'] )
   value = reply['Value']
   self.assertEqual( value['onlineLFNs'], [] )
   self.assertEqual( value['offlineLFNs'], {} )
   expectedAbsent = {lfn: 'No such file or directory ( 2 : File not at SE2)'}
   self.assertEqual( value['absentLFNs'], expectedAbsent )
   self.assertEqual( value['failedLFNs'], [] )
 def test_getFilesToStage_seErrors(self, _patch, _patched):
     """Case where the StorageElement returns a failure: the LFN must be listed in failedLFNs."""
     reply = getFilesToStage(['/a/lfn/2.txt'], checkOnlyTapeSEs=False)
     self.assertTrue(reply['OK'])
     value = reply['Value']
     expectations = (
         ('onlineLFNs', []),
         ('offlineLFNs', {}),
         ('absentLFNs', {}),
         ('failedLFNs', ['/a/lfn/2.txt']),
     )
     for key, expected in expectations:
         self.assertEqual(value[key], expected)
 def test_getFilesToStage_fileInaccessibleAtDisk(self, _patch, _patched):
     """Case where the StorageElement reports the file as unavailable at a Disk SE."""
     requested = ['/a/lfn/1.txt']
     outcome = getFilesToStage(requested, checkOnlyTapeSEs=False)
     self.assertTrue(outcome['OK'])
     details = outcome['Value']
     # Nothing is online, offline or absent; the file is reported as failed.
     self.assertEqual(details['onlineLFNs'], [])
     self.assertEqual(details['offlineLFNs'], {})
     self.assertEqual(details['absentLFNs'], {})
     self.assertEqual(details['failedLFNs'], ['/a/lfn/1.txt'])
 def test_getFilesToStage_noSuchFile(self, _patch, _patched):
     """Case where the StorageElement reports the file as absent.

     In this variant absentLFNs maps the LFN to the list of SEs.
     """
     lfn = '/a/lfn/2.txt'
     reply = getFilesToStage([lfn])
     self.assertTrue(reply['OK'])
     value = reply['Value']
     self.assertEqual(value['onlineLFNs'], [])
     self.assertEqual(value['offlineLFNs'], {})
     self.assertEqual(value['absentLFNs'], {lfn: ['SE2']})
     self.assertEqual(value['failedLFNs'], [])
 def test_getFilesToStage_seErrors( self, _patch, _patched ):
   """ Case where the StorageElement returns a failure:
       the requested LFN must be reported in failedLFNs only.
   """
   lfn = '/a/lfn/2.txt'
   outcome = getFilesToStage( [lfn], checkOnlyTapeSEs = False )
   self.assertTrue( outcome['OK'] )
   details = outcome['Value']
   self.assertEqual( details['onlineLFNs'], [] )
   self.assertEqual( details['offlineLFNs'], {} )
   self.assertEqual( details['absentLFNs'], {} )
   self.assertEqual( details['failedLFNs'], [lfn] )
 def test_getFilesToStage_fileInaccessibleAtDisk( self, _patch, _patched ):
   """ Case where the StorageElement reports the file as unavailable at a Disk SE
   """
   reply = getFilesToStage( ['/a/lfn/1.txt'], checkOnlyTapeSEs = False )
   self.assertTrue( reply['OK'] )
   value = reply['Value']
   expectations = ( ( 'onlineLFNs', [] ),
                    ( 'offlineLFNs', {} ),
                    ( 'absentLFNs', {} ),
                    ( 'failedLFNs', ['/a/lfn/1.txt'] ) )
   for key, expected in expectations:
     self.assertEqual( value[key], expected )
 def test_getFilesToStage_noSuchFile( self, _patch, _patched ):
   """ Case where the StorageElement reports the file as absent
   """
   requested = ['/a/lfn/2.txt']
   outcome = getFilesToStage( requested, checkOnlyTapeSEs = False )
   self.assertTrue( outcome['OK'] )
   details = outcome['Value']
   self.assertEqual( details['onlineLFNs'], [] )
   self.assertEqual( details['offlineLFNs'], {} )
   # The absent file is reported together with the error text.
   absentExpected = {'/a/lfn/2.txt': 'No such file or directory ( 2 : File not at SE2)'}
   self.assertEqual( details['absentLFNs'], absentExpected )
   self.assertEqual( details['failedLFNs'], [] )
 def test_getFilesToStage_withFilesToStage( self, _patch, _patched ):
   """ Case where the StorageElement mock reports the file as offline:
       it must be scheduled for staging at either SE1 or SE2.
   """
   lfn = '/a/lfn/1.txt'
   reply = getFilesToStage( [lfn], checkOnlyTapeSEs = False )
   self.assertTrue( reply['OK'] )
   value = reply['Value']
   self.assertEqual( value['onlineLFNs'], [] )
   acceptable = [{'SE1':[lfn]}, {'SE2':[lfn]}]
   self.assertIn( value['offlineLFNs'], acceptable )
   self.assertEqual( value['absentLFNs'], {} )
   self.assertEqual( value['failedLFNs'], [] )
 def test_getFilesToStage_withFilesToStage( self, _patch, _patched ):
   """ Case where the StorageElement mock reports files as offline
   """
   outcome = getFilesToStage( ['/a/lfn/1.txt'], checkOnlyTapeSEs = False )
   self.assertTrue( outcome['OK'] )
   details = outcome['Value']
   self.assertEqual( details['onlineLFNs'], [] )
   # Either SE may be picked for staging, so both mappings are accepted.
   viaSE1 = {'SE1':['/a/lfn/1.txt']}
   viaSE2 = {'SE2':['/a/lfn/1.txt']}
   self.assertIn( details['offlineLFNs'], [viaSE1, viaSE2] )
   self.assertEqual( details['absentLFNs'], {} )
   self.assertEqual( details['failedLFNs'], [] )
  def test_getFilesToStage_tapeSEOnly_2( self, _patch, _patched ):
    """ Tape-SE-only check where the StorageElement reports the file as offline at tape.
        random.choice in the client module is replaced so the chosen SE is predictable.
    """
    choiceTarget = "DIRAC.StorageManagementSystem.Client.StorageManagerClient.random.choice"
    fixedChoice = MagicMock( return_value='SERandom' )

    with patch( choiceTarget, new=fixedChoice ):
      reply = getFilesToStage( ['/a/lfn/2.txt'], checkOnlyTapeSEs = True )
    self.assertTrue( reply['OK'] )
    value = reply['Value']
    self.assertEqual( value['onlineLFNs'], [] )
    self.assertEqual( value['offlineLFNs'], {'SERandom': ['/a/lfn/2.txt']} )
    self.assertEqual( value['absentLFNs'], {} )
    self.assertEqual( value['failedLFNs'], [] )
 def test_getFilesToStage_noSuchFile(self, _patch, _patched):
     """Case where the StorageElement reports the file as absent (error text names SE1 and SE2)."""
     lfn = "/a/lfn/2.txt"
     reply = getFilesToStage([lfn], checkOnlyTapeSEs=False)
     self.assertTrue(reply["OK"])
     value = reply["Value"]
     self.assertEqual(value["onlineLFNs"], [])
     self.assertEqual(value["offlineLFNs"], {})
     expectedAbsent = {lfn: "No such file or directory ( 2 : File not at SE1,SE2)"}
     self.assertEqual(value["absentLFNs"], expectedAbsent)
     self.assertEqual(value["failedLFNs"], [])
 def test_getFilesToStage_withFilesToStage(self, _patch, _patched):
     """Case where the StorageElement mock reports files as offline; either SE1 or SE2 may be used."""
     lfn = "/a/lfn/1.txt"
     outcome = getFilesToStage([lfn], checkOnlyTapeSEs=False)
     self.assertTrue(outcome["OK"])
     details = outcome["Value"]
     self.assertEqual(details["onlineLFNs"], [])
     acceptable = [{"SE1": [lfn]}, {"SE2": [lfn]}]
     self.assertIn(details["offlineLFNs"], acceptable)
     self.assertEqual(details["absentLFNs"], {})
     self.assertEqual(details["failedLFNs"], [])
    def test_getFilesToStage_tapeSEOnly_2(self, _patch, _patched):
        """Tape-SE-only check where the StorageElement reports the file as offline at tape.

        random.choice in the client module is replaced so the selected SE is
        always "SERandom".
        """
        choiceTarget = "DIRAC.StorageManagementSystem.Client.StorageManagerClient.random.choice"
        fixedChoice = MagicMock(return_value="SERandom")

        with patch(choiceTarget, new=fixedChoice):
            reply = getFilesToStage(["/a/lfn/2.txt"], checkOnlyTapeSEs=True)
        self.assertTrue(reply["OK"])
        value = reply["Value"]
        expectations = (
            ("onlineLFNs", []),
            ("offlineLFNs", {"SERandom": ["/a/lfn/2.txt"]}),
            ("absentLFNs", {}),
            ("failedLFNs", []),
        )
        for key, expected in expectations:
            self.assertEqual(value[key], expected)
Example #20
0
    def test_getFilesToStage_tapeSEOnly_2(self, _patch, _patched):
        """Tape-SE-only check where the StorageElement reports the file as offline at tape."""
        lfn = '/a/lfn/2.txt'
        # Pin the SE selection so the expected offline mapping is deterministic.
        pinnedChoice = MagicMock(return_value='SERandom')
        target = "DIRAC.StorageManagementSystem.Client.StorageManagerClient.random.choice"

        with patch(target, new=pinnedChoice):
            outcome = getFilesToStage([lfn], checkOnlyTapeSEs=True)
        self.assertTrue(outcome['OK'])
        details = outcome['Value']
        self.assertEqual(details['onlineLFNs'], [])
        self.assertEqual(details['offlineLFNs'], {'SERandom': [lfn]})
        self.assertEqual(details['absentLFNs'], {})
        self.assertEqual(details['failedLFNs'], [])
Example #21
0
    def optimizeJob(self, jid, jobState):
        """Decide where a job may run and whether its input data must be staged first.

        Steps, in the order they are executed:

        1. A rescheduled job is put on hold until the delay taken from the
           'RescheduleDelays' option has elapsed.
        2. The user's requested and banned sites are read from the job manifest;
           requested sites that are banned or not usable are dropped, and the job
           is held if none remains (skipped for job types listed in
           'ExcludedOnHoldJobTypes').
        3. If 'CheckPlatform' is enabled, the job platform is validated against
           /Resources/Computing/OSCompatibility and the sites are filtered by it.
        4. A job without input data is sent straight to the task queue (TQ).
        5. A job whose type is listed in 'Transformations/DataProcessing' is checked
           with getFilesToStage and is either staged or sent to the TQ.
        6. Any other job uses the site candidates stored by the input data
           optimizer: it is sent to the TQ when no staging is required, otherwise
           a staging request is issued and the job site is set.

        :param jid: job identifier; not referenced in this method body
        :param jobState: object used to read the job attributes, manifest and input data
        :return: result dictionary with an 'OK' key (S_OK / S_ERROR), or whatever the
                 helper that ends the method (__holdJob, __sendToTQ, __setJobSite) returns
        """
        # Reschedule delay
        result = jobState.getAttributes(
            ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not result['OK']:
            return result
        attDict = result['Value']
        try:
            reschedules = int(attDict['RescheduleCounter'])
        except (ValueError, KeyError):
            return S_ERROR("RescheduleCounter has to be an integer")
        if reschedules != 0:
            delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
            # Clamp the index: reschedule counts beyond the list reuse the last delay
            delay = delays[min(reschedules, len(delays) - 1)]
            waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
            if waited < delay:
                return self.__holdJob(
                    jobState, 'On Hold: after rescheduling %s' % reschedules,
                    delay)

        # Get the job manifest for the later checks
        result = jobState.getManifest()
        if not result['OK']:
            return S_ERROR("Could not retrieve job manifest: %s" %
                           result['Message'])
        jobManifest = result['Value']

        # Get site requirements
        result = self.__getSitesRequired(jobManifest)
        if not result['OK']:
            return result
        userSites, userBannedSites = result['Value']

        # Get job type
        result = jobState.getAttribute("JobType")
        if not result['OK']:
            return S_ERROR("Could not retrieve job type")
        jobType = result['Value']

        # Get banned sites from DIRAC
        result = self.siteClient.getSites('Banned')
        if not result['OK']:
            return S_ERROR("Cannot retrieve banned sites from JobDB")
        wmsBannedSites = result['Value']

        # If the user has selected any site, filter them and hold the job if not able to run
        if userSites:
            if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):

                result = self.siteClient.getUsableSites(userSites)
                if not result['OK']:
                    return S_ERROR(
                        "Problem checking userSites for tuple of active/banned/invalid sites"
                    )
                usableSites = set(result['Value'])
                # Classify every requested site that cannot be used right now
                bannedSites = []
                invalidSites = []
                for site in userSites:
                    if site in wmsBannedSites:
                        bannedSites.append(site)
                    elif site not in usableSites:
                        invalidSites.append(site)

                if invalidSites:
                    self.jobLog.debug("Invalid site(s) requested: %s" %
                                      ','.join(invalidSites))
                    if not self.ex_getOption('AllowInvalidSites', True):
                        return self.__holdJob(
                            jobState, "Requested site(s) %s are invalid" %
                            ",".join(invalidSites))
                if bannedSites:
                    self.jobLog.debug("Banned site(s) %s ignored" %
                                      ",".join(bannedSites))
                    if not usableSites:
                        return self.__holdJob(
                            jobState, "Requested site(s) %s are inactive" %
                            ",".join(bannedSites))

                if not usableSites:
                    return self.__holdJob(
                        jobState, "No requested site(s) are active/valid")
                # From here on only the usable subset of the requested sites is considered
                userSites = list(usableSites)

        checkPlatform = self.ex_getOption('CheckPlatform', False)
        jobPlatform = jobManifest.getOption("Platform", None)
        # First check that the platform is valid (in OSCompatibility list)
        if checkPlatform and jobPlatform:
            result = gConfig.getOptionsDict(
                '/Resources/Computing/OSCompatibility')
            if not result['OK']:
                return S_ERROR("Unable to get OSCompatibility list")
            allPlatforms = result['Value']
            if jobPlatform not in allPlatforms:
                self.jobLog.error("Platform not supported", jobPlatform)
                return S_ERROR("Platform %s is not supported" % jobPlatform)

        # Filter the userSites by the platform selection (if there is one)
        if checkPlatform and userSites:
            if jobPlatform:
                result = self.__filterByPlatform(jobPlatform, userSites)
                if not result['OK']:
                    self.jobLog.error("Failed to filter job sites by platform",
                                      result['Message'])
                    return S_ERROR("Failed to filter job sites by platform")
                userSites = result['Value']
                if not userSites:
                    # No sites left after filtering -> Invalid platform/sites combination
                    self.jobLog.error("No selected sites match platform",
                                      jobPlatform)
                    return S_ERROR("No selected sites match platform '%s'" %
                                   jobPlatform)

        # Check if there is input data
        result = jobState.getInputData()
        if not result['OK']:
            self.jobLog.error("Cannot get input data", result['Message'])
            return S_ERROR("Failed to get input data from JobDB")

        if not result['Value']:
            # No input data? Just send to TQ
            return self.__sendToTQ(jobState, jobManifest, userSites,
                                   userBannedSites)

        self.jobLog.verbose("Has an input data requirement")
        inputData = result['Value']

        # ===================================================================================
        # Production jobs are sent to TQ, but first we have to verify if staging is necessary
        # ===================================================================================
        if jobType in Operations().getValue('Transformations/DataProcessing',
                                            []):
            self.jobLog.info(
                "Production job: sending to TQ, but first checking if staging is requested"
            )

            res = getFilesToStage(inputData,
                                  jobState=jobState,
                                  checkOnlyTapeSEs=self.ex_getOption(
                                      'CheckOnlyTapeSEs', True),
                                  jobLog=self.jobLog)

            if not res['OK']:
                return self.__holdJob(jobState, res['Message'])
            if res['Value']['absentLFNs']:
                # Some files do not exist at all... set the job Failed
                # Reverse errors
                # NOTE(review): dict.iteritems() exists only on Python 2
                reasons = {}
                for lfn, reason in res['Value']['absentLFNs'].iteritems():
                    reasons.setdefault(reason, []).append(lfn)
                for reason, lfns in reasons.iteritems():
                    # Some files are missing in the FC or in SEs, fail the job
                    self.jobLog.error(reason, ','.join(lfns))
                # Joining the dict joins its keys, i.e. the distinct reasons
                error = ','.join(reasons)
                return S_ERROR(error)

            if res['Value']['failedLFNs']:
                return self.__holdJob(
                    jobState, "Couldn't get storage metadata of some files")
            stageLFNs = res['Value']['offlineLFNs']
            if stageLFNs:
                res = self.__checkStageAllowed(jobState)
                if not res['OK']:
                    return res
                if not res['Value']:
                    return S_ERROR("Stage not allowed")
                # NOTE(review): the result of __requestStaging is not checked here,
                # unlike in the user-job path further down - confirm this is intended
                self.__requestStaging(jobState, stageLFNs)
                return S_OK()
            else:
                # No staging required
                onlineSites = res['Value']['onlineSites']
                if onlineSites:
                    # Set the online site(s) first
                    # NOTE(review): if userSites is empty, the in-place intersection
                    # below leaves onlineSites empty as well - confirm this is intended
                    userSites = set(userSites)
                    onlineSites &= userSites
                    userSites = list(onlineSites) + list(userSites -
                                                         onlineSites)
                return self.__sendToTQ(jobState,
                                       jobManifest,
                                       userSites,
                                       userBannedSites,
                                       onlineSites=onlineSites)

        # ===================================================
        # From now on we know it's a user job with input data
        # ===================================================

        idAgent = self.ex_getOption('InputDataAgent', 'InputData')
        result = self.retrieveOptimizerParam(idAgent)
        if not result['OK']:
            self.jobLog.error("Could not retrieve input data info",
                              result['Message'])
            return S_ERROR("Could not retrieve input data info")
        opData = result['Value']

        if 'SiteCandidates' not in opData:
            return S_ERROR("No possible site candidates")

        # Filter input data sites with user requirement
        siteCandidates = list(opData['SiteCandidates'])
        self.jobLog.info("Site candidates are %s" % siteCandidates)

        if userSites:
            siteCandidates = list(set(siteCandidates) & set(userSites))

        siteCandidates = self._applySiteFilter(siteCandidates,
                                               banned=userBannedSites)
        if not siteCandidates:
            return S_ERROR("Impossible InputData * Site requirements")

        # Keep the per-site replica information only for the remaining candidates
        idSites = {}
        for site in siteCandidates:
            idSites[site] = opData['SiteCandidates'][site]

        # Check if sites have correct count of disk+tape replicas
        numData = len(inputData)
        errorSites = set()
        for site in idSites:
            if numData != idSites[site]['disk'] + idSites[site]['tape']:
                self.jobLog.error(
                    "Site candidate %s does not have all the input data" %
                    site)
                errorSites.add(site)
        # Removal is done in a second pass so idSites is not modified while iterating it
        for site in errorSites:
            idSites.pop(site)
        if not idSites:
            return S_ERROR("Site candidates do not have all the input data")

        # Check if staging is required
        stageRequired, siteCandidates = self.__resolveStaging(
            inputData, idSites)
        if not siteCandidates:
            return S_ERROR("No destination sites available")

        # Is any site active?
        stageSites = self._applySiteFilter(siteCandidates,
                                           banned=wmsBannedSites)
        if not stageSites:
            return self.__holdJob(
                jobState,
                "Sites %s are inactive or banned" % ", ".join(siteCandidates))

        # If no staging is required send to TQ
        if not stageRequired:
            # Use siteCandidates and not stageSites because active and banned sites
            # will be taken into account on matching time
            return self.__sendToTQ(jobState, jobManifest, siteCandidates,
                                   userBannedSites)

        # Check if the user is allowed to stage
        if self.ex_getOption("RestrictDataStage", False):
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")

        # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
        stageSite = stageSites[0]
        self.jobLog.verbose(" Staging site will be %s" % (stageSite))
        stageData = idSites[stageSite]
        # Set as if everything has already been staged
        stageData['disk'] += stageData['tape']
        stageData['tape'] = 0
        # Set the site info back to the original dict to save afterwards
        opData['SiteCandidates'][stageSite] = stageData

        stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
        if not stageRequest['OK']:
            return stageRequest
        stageLFNs = stageRequest['Value']
        result = self.__requestStaging(jobState, stageLFNs)
        if not result['OK']:
            return result
        # stageLFNs is replaced by the value returned from the staging request
        stageLFNs = result['Value']
        self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
        # Save the optimizer data again
        self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
        result = self.storeOptimizerParam(idAgent, opData)
        if not result['OK']:
            return result

        return self.__setJobSite(jobState, stageSites)
Example #22
0
  def optimizeJob( self, jid, jobState ):
    """ Decide where a job may run and whether its input data must be staged first.

        Steps, in the order they are executed:

        1. A rescheduled job is put on hold until the delay taken from the
           'RescheduleDelays' option has elapsed.
        2. The user's requested and banned sites are resolved; requested sites are
           split into usable/banned/invalid ones and the job is held if no usable
           site remains (skipped for job types listed in 'ExcludedOnHoldJobTypes').
        3. A job without input data is sent straight to the task queue (TQ).
        4. A job whose type is listed in 'Transformations/DataProcessing' is checked
           with getFilesToStage (using the job owner and group) and is either
           staged or sent to the TQ.
        5. Any other job uses the site candidates stored by the input data
           optimizer: it is sent to the TQ when no staging is required, otherwise
           a staging request is issued and the job site is set.

        :param jid: job identifier; not referenced in this method body
        :param jobState: object used to read the job attributes and input data
        :return: result dictionary with an 'OK' key (S_OK / S_ERROR), or whatever the
                 helper that ends the method (__holdJob, __sendToTQ, __setJobSite) returns
    """
    # Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
      return result
    attDict = result[ 'Value' ]
    try:
      reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ( ValueError, KeyError ):
      return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
      delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
      # Clamp the index: reschedule counts beyond the list reuse the last delay
      delay = delays[ min( reschedules, len( delays ) - 1 ) ]
      waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
      if waited < delay:
        return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    # Get site requirements
    result = self.__getSitesRequired( jobState )
    if not result[ 'OK' ]:
      return result
    userSites, userBannedSites = result[ 'Value' ]

    # Get job type
    result = jobState.getAttribute( "JobType" )
    if not result[ 'OK' ]:
      return S_ERROR( "Could not retrieve job type" )
    jobType = result[ 'Value' ]

    # Get banned sites from DIRAC
    result = self.__jobDB.getSiteMask( 'Banned' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    wmsBannedSites = result[ 'Value' ]

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
        result = self.__jobDB.getUserSitesTuple( userSites )
        if not result[ 'OK' ]:
          return S_ERROR( "Problem checking userSites for tuple of active/banned/invalid sites" )

        # userSites is narrowed to the first element of the returned tuple
        userSites, bannedSites, invalidSites = result['Value']
        if invalidSites:
          self.jobLog.debug( "Invalid site(s) requested: %s" % ','.join( invalidSites ) )
          if not self.ex_getOption( 'AllowInvalidSites', True ):
            return self.__holdJob( jobState, "Requested site(s) %s are invalid" % ",".join( invalidSites ) )
        if bannedSites:
          self.jobLog.debug( "Banned site(s) %s ignored" % ",".join( bannedSites ) )
          if not userSites:
            return self.__holdJob( jobState, "Requested site(s) %s are inactive" % ",".join( bannedSites ) )

        if not userSites:
          return self.__holdJob( jobState, "No requested site(s) are active/valid" )
        userSites = list(userSites)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
      return S_ERROR( "Failed to get input data from JobDB" )

    if not result['Value']:
      # No input data? Just send to TQ
      return self.__sendToTQ( jobState, userSites, userBannedSites )

    self.jobLog.verbose( "Has an input data requirement" )
    inputData = result[ 'Value' ]

    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    if jobType in Operations().getValue( 'Transformations/DataProcessing', [] ):
      self.jobLog.info( "Production job: sending to TQ, but first checking if staging is requested" )

      # userName first holds the result dictionary, then the attribute value itself
      userName = jobState.getAttribute( 'Owner' )
      if not userName[ 'OK' ]:
        return userName
      userName = userName['Value']

      # Same pattern for the owner group
      userGroup = jobState.getAttribute( 'OwnerGroup' )
      if not userGroup[ 'OK' ]:
        return userGroup
      userGroup = userGroup['Value']

      res = getFilesToStage( inputData, proxyUserName = userName, proxyUserGroup = userGroup ) #pylint: disable=unexpected-keyword-arg

      if not res['OK']:
        return self.__holdJob( jobState, res['Message'] )
      stageLFNs = res['Value']['offlineLFNs']
      if stageLFNs:
        res = self.__checkStageAllowed( jobState )
        if not res['OK']:
          return res
        if not res['Value']:
          return S_ERROR( "Stage not allowed" )
        # NOTE(review): the result of __requestStaging is not checked here,
        # unlike in the user-job path further down - confirm this is intended
        self.__requestStaging( jobState, stageLFNs )
        return S_OK()
      else:
        return self.__sendToTQ( jobState, userSites, userBannedSites )

    # From now on we know it's a user job with input data

    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
      self.jobLog.error( "Could not retrieve input data info", result[ 'Message' ] )
      return S_ERROR( "Could not retrieve input data info" )
    opData = result[ 'Value' ]

    if 'SiteCandidates' not in opData:
      return S_ERROR( "No possible site candidates" )

    # Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    if userSites:
      siteCandidates = list( set( siteCandidates ) & set( userSites ) )

    siteCandidates = self._applySiteFilter( siteCandidates, banned = userBannedSites )
    if not siteCandidates:
      return S_ERROR( "Impossible InputData * Site requirements" )

    # Keep the per-site replica information only for the remaining candidates
    idSites = {}
    for site in siteCandidates:
      idSites[ site ] = opData[ 'SiteCandidates' ][ site ]

    # Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    errorSites = set()
    for site in idSites:
      if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
        self.jobLog.error( "Site candidate %s does not have all the input data" % site )
        errorSites.add( site )
    # Removal is done in a second pass so idSites is not modified while iterating it
    for site in errorSites:
      idSites.pop( site )
    if not idSites:
      return S_ERROR( "Site candidates do not have all the input data" )

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
      return S_ERROR( "No destination sites available" )

    # Is any site active?
    stageSites = self._applySiteFilter( siteCandidates, banned = wmsBannedSites )
    if not stageSites:
      return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    # Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
      res = self.__checkStageAllowed( jobState )
      if not res['OK']:
        return res
      if not res['Value']:
        return S_ERROR( "Stage not allowed" )

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    # Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    # Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    stageRequest = self.__preRequestStaging( jobState, stageSite, opData )
    if not stageRequest['OK']:
      return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging( jobState, stageLFNs )
    if not result[ 'OK' ]:
      return result
    # stageLFNs is replaced by the value returned from the staging request
    stageLFNs = result[ 'Value' ]
    self.__updateSharedSESites( jobState, stageSite, stageLFNs, opData )
    # Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
      return result

    return self.__setJobSite( jobState, stageSites )
Example #23
0
  def optimizeJob(self, jid, jobState):
    """ Decide where a job can run and whether its input data must be staged first.

        1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary

        :param jid: job identifier; it is not referenced inside this method
        :param jobState: job state object; this method calls its ``getAttributes``,
                         ``getManifest``, ``getAttribute`` and ``getInputData`` methods
                         and hands it on to the private helpers
        :return: an ``S_OK``/``S_ERROR``-style result. Depending on the path taken this is
                 the result of ``__holdJob``, ``__sendToTQ``, ``__setJobSite``, a failed
                 intermediate call returned unchanged, or a plain ``S_OK()`` once a
                 staging request for a production job has been issued successfully.
    """
    # ------------------------------------------------------------------
    # Reschedule delay: hold the job if it was rescheduled too recently
    # ------------------------------------------------------------------
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
      return result
    attDict = result['Value']
    try:
      reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError, TypeError):
      # TypeError covers a counter stored as None, which int() cannot convert
      return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
      delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
      # The last configured delay is reused once the counter exceeds the list length.
      # NOTE(review): since reschedules != 0 here, delays[0] is never selected when
      # the list has more than one entry - confirm this is intended.
      delay = delays[min(reschedules, len(delays) - 1)]
      waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
      if waited < delay:
        return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
      return S_ERROR("Could not retrieve job manifest: %s" % result['Message'])
    jobManifest = result['Value']

    # Get site requirements
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
      return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
      return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
      return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run.
    # Job types listed in ExcludedOnHoldJobTypes skip this filtering altogether.
    if userSites and jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
      result = self.siteClient.getUsableSites(userSites)
      if not result['OK']:
        return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
      usableSites = set(result['Value'])

      # Classify every requested site that is not usable: banned takes precedence,
      # anything else that is not usable is reported as invalid.
      bannedSites = []
      invalidSites = []
      for site in userSites:
        if site in wmsBannedSites:
          bannedSites.append(site)
        elif site not in usableSites:
          invalidSites.append(site)

      if invalidSites:
        self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
        if not self.ex_getOption('AllowInvalidSites', True):
          return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
      if bannedSites:
        self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
        if not usableSites:
          return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

      if not usableSites:
        return self.__holdJob(jobState, "No requested site(s) are active/valid")
      userSites = list(usableSites)

    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
      result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
      if not result['OK']:
        return S_ERROR("Unable to get OSCompatibility list")
      allPlatforms = result['Value']
      if jobPlatform not in allPlatforms:
        self.jobLog.error("Platform %s is not supported" % jobPlatform)
        return S_ERROR("Platform %s is not supported" % jobPlatform)

    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites and jobPlatform:
      result = self.__filterByPlatform(jobPlatform, userSites)
      if not result['OK']:
        self.jobLog.error("Failed to filter job sites by platform: %s" % result['Message'])
        return S_ERROR("Failed to filter job sites by platform")
      userSites = result['Value']
      if not userSites:
        # No sites left after filtering -> Invalid platform/sites combination
        self.jobLog.error("No selected sites match platform '%s'" % jobPlatform)
        return S_ERROR("No selected sites match platform '%s'" % jobPlatform)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error("Cannot get input data %s" % (result['Message']))
      return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
      # No input data? Just send to TQ
      return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
      self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

      res = getFilesToStage(inputData,
                            jobState=jobState,
                            checkOnlyTapeSEs=self.ex_getOption('CheckOnlyTapeSEs', True),
                            jobLog=self.jobLog)

      if not res['OK']:
        return self.__holdJob(jobState, res['Message'])
      stageInfo = res['Value']

      if stageInfo['absentLFNs']:
        # Some files do not exist at all... set the job Failed
        # Reverse errors: group the LFNs by failure reason so each reason is logged once
        reasons = {}
        for lfn, reason in stageInfo['absentLFNs'].items():
          reasons.setdefault(reason, []).append(lfn)
        for reason, lfns in reasons.items():
          # Some files are missing in the FC or in SEs, fail the job
          self.jobLog.error(reason, ','.join(lfns))
        error = ','.join(reasons)
        return S_ERROR(error)

      if stageInfo['failedLFNs']:
        return self.__holdJob(jobState, "Couldn't get storage metadata of some files")

      stageLFNs = stageInfo['offlineLFNs']
      if stageLFNs:
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
          return res
        if not res['Value']:
          return S_ERROR("Stage not allowed")
        # Propagate a failed stage request instead of reporting success,
        # exactly as done for user jobs further down.
        res = self.__requestStaging(jobState, stageLFNs)
        if not res['OK']:
          return res
        return S_OK()

      # No staging required
      onlineSites = stageInfo['onlineSites']
      if onlineSites:
        # Set the online site(s) first. Build a new set rather than updating the
        # one owned by the getFilesToStage result in place.
        # NOTE(review): when userSites is empty the intersection is empty too, so
        # the online-site hint is dropped - confirm this is intended.
        userSites = set(userSites)
        onlineSites = set(onlineSites) & userSites
        userSites = list(onlineSites) + list(userSites - onlineSites)
      return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites, onlineSites=onlineSites)

    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
      self.jobLog.error("Could not retrieve input data info", result['Message'])
      return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
      return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
      siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
      return S_ERROR("Impossible InputData * Site requirements")

    # Check if sites have correct count of disk+tape replicas: keep only the
    # candidates whose disk + tape counts add up to the number of input files.
    numData = len(inputData)
    idSites = {}
    for site in siteCandidates:
      siteData = opData['SiteCandidates'][site]
      if numData != siteData['disk'] + siteData['tape']:
        self.jobLog.error("Site candidate %s does not have all the input data" % site)
        continue
      idSites[site] = siteData
    if not idSites:
      return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
      return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
      return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ(jobState, jobManifest, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
      res = self.__checkStageAllowed(jobState)
      if not res['OK']:
        return res
      if not res['Value']:
        return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
      return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
      return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
      return result

    return self.__setJobSite(jobState, stageSites)