def test_getFilesToStage(self, _patch, _patched):
    """Simple test - the StorageElement mock will return all the files online.

    An empty request must come back with empty online/offline results; a
    request for '/a/lfn/1.txt' is expected to list both '/a/lfn/1.txt' and
    '/a/lfn/2.txt' as online and nothing offline.
    """
    # Nothing requested: nothing online, nothing to stage
    emptyOutcome = getFilesToStage([])
    self.assertTrue(emptyOutcome['OK'])
    emptyValue = emptyOutcome['Value']
    self.assertEqual(emptyValue['onlineLFNs'], [])
    self.assertEqual(emptyValue['offlineLFNs'], {})

    # A single LFN requested
    lfnOutcome = getFilesToStage(['/a/lfn/1.txt'])
    self.assertTrue(lfnOutcome['OK'])
    lfnValue = lfnOutcome['Value']
    self.assertEqual(lfnValue['onlineLFNs'], ['/a/lfn/1.txt', '/a/lfn/2.txt'])
    self.assertEqual(lfnValue['offlineLFNs'], {})
def test_getFilesToStage(self):
    """Check getFilesToStage with an empty request, then with one LFN.

    For the second call the DataManager and StorageElement names of the
    StorageManagerClient module are replaced by the mocks held on the test
    case (self.mockDM / self.mockSE).
    """
    res = getFilesToStage([])
    self.assertTrue(res["OK"])
    self.assertEqual(res["Value"]["onlineLFNs"], [])
    self.assertEqual(res["Value"]["offlineLFNs"], {})

    # Swap the module-level collaborators for the mocks
    ourSMC = importlib.import_module("DIRAC.StorageManagementSystem.Client.StorageManagerClient")
    ourSMC.DataManager = self.mockDM
    ourSMC.StorageElement = self.mockSE

    res = getFilesToStage(["/a/lfn/1.txt"])
    self.assertTrue(res["OK"])
    self.assertEqual(res["Value"]["onlineLFNs"], ["/a/lfn/2.txt"])
    # The file may be reported offline at either SE. The former
    # assert_(value, A or B) only checked that the value was truthy, since
    # its second argument is the failure message; compare against both
    # acceptable outcomes explicitly.
    self.assertIn(
        res["Value"]["offlineLFNs"],
        [{"SE1": ["/a/lfn/1.txt"]}, {"SE2": ["/a/lfn/1.txt"]}],
    )
def test_getFilesToStage(self):
    """Check getFilesToStage with an empty request, then with one LFN.

    For the second call the DataManager and StorageElement names of the
    StorageManagerClient module are replaced by the mocks held on the test
    case (self.mockDM / self.mockSE).
    """
    res = getFilesToStage([])
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['onlineLFNs'], [])
    self.assertEqual(res['Value']['offlineLFNs'], {})

    # Swap the module-level collaborators for the mocks
    ourSMC = importlib.import_module('DIRAC.StorageManagementSystem.Client.StorageManagerClient')
    ourSMC.DataManager = self.mockDM
    ourSMC.StorageElement = self.mockSE

    res = getFilesToStage(['/a/lfn/1.txt'])
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['onlineLFNs'], ['/a/lfn/2.txt'])
    # The file may be reported offline at either SE. The former
    # assert_(value, A or B) only checked that the value was truthy, since
    # its second argument is the failure message; compare against both
    # acceptable outcomes explicitly.
    self.assertIn(
        res['Value']['offlineLFNs'],
        [{'SE1': ['/a/lfn/1.txt']}, {'SE2': ['/a/lfn/1.txt']}],
    )
def test_getFilesToStage_tapeSEOnly_1(self, _patch, _patched):
    """Tape-SE-only check where the StorageElement reports the file as available.

    The requested LFN must be listed as online, with nothing offline, absent
    or failed.
    """
    outcome = getFilesToStage(["/a/lfn/2.txt"], checkOnlyTapeSEs=True)
    self.assertTrue(outcome["OK"])
    payload = outcome["Value"]
    # Checked in this order: online, offline, absent, failed
    expectations = (
        ("onlineLFNs", ["/a/lfn/2.txt"]),
        ("offlineLFNs", {}),
        ("absentLFNs", {}),
        ("failedLFNs", []),
    )
    for key, wanted in expectations:
        self.assertEqual(payload[key], wanted)
def test_getFilesToStage_fileInaccessibleAtDisk(self, _patch, _patched):
    """StorageElement reports the file as unavailable at a Disk SE.

    The LFN must end up in the failed list only.
    """
    outcome = getFilesToStage(["/a/lfn/1.txt"], checkOnlyTapeSEs=False)
    self.assertTrue(outcome["OK"])
    payload = outcome["Value"]
    # Checked in this order: online, offline, absent, failed
    expectations = (
        ("onlineLFNs", []),
        ("offlineLFNs", {}),
        ("absentLFNs", {}),
        ("failedLFNs", ["/a/lfn/1.txt"]),
    )
    for key, wanted in expectations:
        self.assertEqual(payload[key], wanted)
def test_getFilesToStage_withFilesToStage(self, _patch, _patched):
    """Test where the StorageElement mock will return files offline.

    '/a/lfn/2.txt' is expected online, while '/a/lfn/1.txt' must be reported
    offline at exactly one of SE1 / SE2.
    """
    res = getFilesToStage(['/a/lfn/1.txt'])
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['onlineLFNs'], ['/a/lfn/2.txt'])
    # The former assert_(value, A or B) treated its second argument as the
    # failure message and therefore only checked truthiness; compare against
    # both acceptable outcomes explicitly.
    self.assertIn(
        res['Value']['offlineLFNs'],
        [{'SE1': ['/a/lfn/1.txt']}, {'SE2': ['/a/lfn/1.txt']}],
    )
def test_getFilesToStage_noSuchFile(self, _patch, _patched):
    """StorageElement reports the file as absent.

    The LFN must appear in absentLFNs with the 'No such file' message naming
    SE2, and in no other category.
    """
    outcome = getFilesToStage(['/a/lfn/2.txt'], checkOnlyTapeSEs=False)
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    expectedAbsent = {'/a/lfn/2.txt': 'No such file or directory ( 2 : File not at SE2)'}
    self.assertEqual(payload['onlineLFNs'], [])
    self.assertEqual(payload['offlineLFNs'], {})
    self.assertEqual(payload['absentLFNs'], expectedAbsent)
    self.assertEqual(payload['failedLFNs'], [])
def test_getFilesToStage_seErrors(self, _patch, _patched):
    """StorageElement returns a failure.

    The LFN must be listed in failedLFNs only.
    """
    outcome = getFilesToStage(['/a/lfn/2.txt'], checkOnlyTapeSEs=False)
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    # Checked in this order: online, offline, absent, failed
    expectations = (
        ('onlineLFNs', []),
        ('offlineLFNs', {}),
        ('absentLFNs', {}),
        ('failedLFNs', ['/a/lfn/2.txt']),
    )
    for key, wanted in expectations:
        self.assertEqual(payload[key], wanted)
def test_getFilesToStage_fileInaccessibleAtDisk(self, _patch, _patched):
    """StorageElement reports the file as unavailable at a Disk SE.

    The LFN must end up in the failed list only.
    """
    outcome = getFilesToStage(['/a/lfn/1.txt'], checkOnlyTapeSEs=False)
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    # Checked in this order: online, offline, absent, failed
    expectations = (
        ('onlineLFNs', []),
        ('offlineLFNs', {}),
        ('absentLFNs', {}),
        ('failedLFNs', ['/a/lfn/1.txt']),
    )
    for key, wanted in expectations:
        self.assertEqual(payload[key], wanted)
def test_getFilesToStage_noSuchFile(self, _patch, _patched):
    """StorageElement reports the file as absent.

    The LFN must be mapped to ['SE2'] in absentLFNs and appear in no other
    category.
    """
    outcome = getFilesToStage(['/a/lfn/2.txt'])
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    # Checked in this order: online, offline, absent, failed
    expectations = (
        ('onlineLFNs', []),
        ('offlineLFNs', {}),
        ('absentLFNs', {'/a/lfn/2.txt': ['SE2']}),
        ('failedLFNs', []),
    )
    for key, wanted in expectations:
        self.assertEqual(payload[key], wanted)
def test_getFilesToStage_seErrors(self, _patch, _patched):
    """StorageElement returns a failure.

    The LFN must be listed in failedLFNs only.
    """
    outcome = getFilesToStage(['/a/lfn/2.txt'], checkOnlyTapeSEs=False)
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    self.assertEqual(payload['onlineLFNs'], [])
    self.assertEqual(payload['offlineLFNs'], {})
    self.assertEqual(payload['absentLFNs'], {})
    self.assertEqual(payload['failedLFNs'], ['/a/lfn/2.txt'])
def test_getFilesToStage_fileInaccessibleAtDisk(self, _patch, _patched):
    """StorageElement reports the file as unavailable at a Disk SE.

    The LFN must end up in the failed list only.
    """
    outcome = getFilesToStage(['/a/lfn/1.txt'], checkOnlyTapeSEs=False)
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    self.assertEqual(payload['onlineLFNs'], [])
    self.assertEqual(payload['offlineLFNs'], {})
    self.assertEqual(payload['absentLFNs'], {})
    self.assertEqual(payload['failedLFNs'], ['/a/lfn/1.txt'])
def test_getFilesToStage_withFilesToStage(self, _patch, _patched):
    """StorageElement mock reports the file as offline.

    The LFN must be scheduled for staging at either SE1 or SE2 and appear in
    no other category.
    """
    outcome = getFilesToStage(['/a/lfn/1.txt'], checkOnlyTapeSEs=False)
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    # Either SE is an acceptable staging source
    acceptableOffline = [{'SE1': ['/a/lfn/1.txt']}, {'SE2': ['/a/lfn/1.txt']}]
    self.assertEqual(payload['onlineLFNs'], [])
    self.assertIn(payload['offlineLFNs'], acceptableOffline)
    self.assertEqual(payload['absentLFNs'], {})
    self.assertEqual(payload['failedLFNs'], [])
def test_getFilesToStage_tapeSEOnly_2(self, _patch, _patched):
    """Tape-SE-only check where the StorageElement reports the file offline at tape.

    random.choice inside the StorageManagerClient module is pinned to
    'SERandom' so that the expected offline SE key is predictable.
    """
    choiceTarget = "DIRAC.StorageManagementSystem.Client.StorageManagerClient.random.choice"
    pinnedChoice = MagicMock(return_value='SERandom')
    with patch(choiceTarget, new=pinnedChoice):
        outcome = getFilesToStage(['/a/lfn/2.txt'], checkOnlyTapeSEs=True)
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    self.assertEqual(payload['onlineLFNs'], [])
    self.assertEqual(payload['offlineLFNs'], {'SERandom': ['/a/lfn/2.txt']})
    self.assertEqual(payload['absentLFNs'], {})
    self.assertEqual(payload['failedLFNs'], [])
def test_getFilesToStage_noSuchFile(self, _patch, _patched):
    """StorageElement reports the file as absent.

    The LFN must appear in absentLFNs with the 'No such file' message naming
    SE1,SE2, and in no other category.
    """
    outcome = getFilesToStage(["/a/lfn/2.txt"], checkOnlyTapeSEs=False)
    self.assertTrue(outcome["OK"])
    payload = outcome["Value"]
    expectedAbsent = {"/a/lfn/2.txt": "No such file or directory ( 2 : File not at SE1,SE2)"}
    self.assertEqual(payload["onlineLFNs"], [])
    self.assertEqual(payload["offlineLFNs"], {})
    self.assertEqual(payload["absentLFNs"], expectedAbsent)
    self.assertEqual(payload["failedLFNs"], [])
def test_getFilesToStage_withFilesToStage(self, _patch, _patched):
    """StorageElement mock reports the file as offline.

    The LFN must be scheduled for staging at either SE1 or SE2 and appear in
    no other category.
    """
    outcome = getFilesToStage(["/a/lfn/1.txt"], checkOnlyTapeSEs=False)
    self.assertTrue(outcome["OK"])
    payload = outcome["Value"]
    # Either SE is an acceptable staging source
    acceptableOffline = [{"SE1": ["/a/lfn/1.txt"]}, {"SE2": ["/a/lfn/1.txt"]}]
    self.assertEqual(payload["onlineLFNs"], [])
    self.assertIn(payload["offlineLFNs"], acceptableOffline)
    self.assertEqual(payload["absentLFNs"], {})
    self.assertEqual(payload["failedLFNs"], [])
def test_getFilesToStage_tapeSEOnly_2(self, _patch, _patched):
    """Tape-SE-only check where the StorageElement reports the file offline at tape.

    random.choice inside the StorageManagerClient module is pinned to
    "SERandom" so that the expected offline SE key is predictable.
    """
    choiceTarget = "DIRAC.StorageManagementSystem.Client.StorageManagerClient.random.choice"
    pinnedChoice = MagicMock(return_value="SERandom")
    with patch(choiceTarget, new=pinnedChoice):
        outcome = getFilesToStage(["/a/lfn/2.txt"], checkOnlyTapeSEs=True)
    self.assertTrue(outcome["OK"])
    payload = outcome["Value"]
    self.assertEqual(payload["onlineLFNs"], [])
    self.assertEqual(payload["offlineLFNs"], {"SERandom": ["/a/lfn/2.txt"]})
    self.assertEqual(payload["absentLFNs"], {})
    self.assertEqual(payload["failedLFNs"], [])
def test_getFilesToStage_tapeSEOnly_2(self, _patch, _patched):
    """Tape-SE-only check where the StorageElement reports the file offline at tape.

    random.choice inside the StorageManagerClient module is pinned to
    'SERandom' so that the expected offline SE key is predictable.
    """
    choiceTarget = "DIRAC.StorageManagementSystem.Client.StorageManagerClient.random.choice"
    pinnedChoice = MagicMock(return_value='SERandom')
    with patch(choiceTarget, new=pinnedChoice):
        outcome = getFilesToStage(['/a/lfn/2.txt'], checkOnlyTapeSEs=True)
    self.assertTrue(outcome['OK'])
    payload = outcome['Value']
    # Checked in this order: online, offline, absent, failed
    expectations = (
        ('onlineLFNs', []),
        ('offlineLFNs', {'SERandom': ['/a/lfn/2.txt']}),
        ('absentLFNs', {}),
        ('failedLFNs', []),
    )
    for key, wanted in expectations:
        self.assertEqual(payload[key], wanted)
def optimizeJob(self, jid, jobState):
    """ 1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary

        :param jid: job identifier (not referenced in this method's body)
        :param jobState: job state object; this method calls its
                         getAttributes, getManifest, getAttribute and getInputData
        :return: a result structure carrying an 'OK' key: S_OK() / S_ERROR(msg),
                 or whatever the delegated helper (__holdJob, __sendToTQ,
                 __setJobSite, ...) returns
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError, TypeError):
        # TypeError covers a non-numeric, non-string value such as None
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Past the end of the list the last delay keeps being used
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
        return S_ERROR("Could not retrieve job manifest: %s" % result['Message'])
    jobManifest = result['Value']

    # Get site requirements
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
        return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            result = self.siteClient.getUsableSites(userSites)
            if not result['OK']:
                return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
            usableSites = set(result['Value'])
            # Requested sites that are not usable are either banned or invalid
            bannedSites = []
            invalidSites = []
            for site in userSites:
                if site in wmsBannedSites:
                    bannedSites.append(site)
                elif site not in usableSites:
                    invalidSites.append(site)

            if invalidSites:
                self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
                if not self.ex_getOption('AllowInvalidSites', True):
                    return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
            if bannedSites:
                self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
                if not usableSites:
                    return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

            if not usableSites:
                return self.__holdJob(jobState, "No requested site(s) are active/valid")
            userSites = list(usableSites)

    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
        result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
        if not result['OK']:
            return S_ERROR("Unable to get OSCompatibility list")
        allPlatforms = result['Value']
        if jobPlatform not in allPlatforms:
            self.jobLog.error("Platform not supported", jobPlatform)
            return S_ERROR("Platform %s is not supported" % jobPlatform)

    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites:
        if jobPlatform:
            result = self.__filterByPlatform(jobPlatform, userSites)
            if not result['OK']:
                self.jobLog.error("Failed to filter job sites by platform", result['Message'])
                return S_ERROR("Failed to filter job sites by platform")
            userSites = result['Value']
            if not userSites:
                # No sites left after filtering -> Invalid platform/sites combination
                self.jobLog.error("No selected sites match platform", jobPlatform)
                return S_ERROR("No selected sites match platform '%s'" % jobPlatform)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data", result['Message'])
        return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
        # No input data? Just send to TQ
        return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
        self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

        res = getFilesToStage(inputData,
                              jobState=jobState,
                              checkOnlyTapeSEs=self.ex_getOption('CheckOnlyTapeSEs', True),
                              jobLog=self.jobLog)

        if not res['OK']:
            return self.__holdJob(jobState, res['Message'])
        if res['Value']['absentLFNs']:
            # Some files do not exist at all... set the job Failed
            # Reverse errors: group the LFNs by failure reason
            reasons = {}
            # items() rather than iteritems(): the latter exists on Python 2 only
            for lfn, reason in res['Value']['absentLFNs'].items():
                reasons.setdefault(reason, []).append(lfn)
            for reason, lfns in reasons.items():
                # Some files are missing in the FC or in SEs, fail the job
                self.jobLog.error(reason, ','.join(lfns))
            error = ','.join(reasons)
            return S_ERROR(error)

        if res['Value']['failedLFNs']:
            return self.__holdJob(jobState, "Couldn't get storage metadata of some files")
        stageLFNs = res['Value']['offlineLFNs']
        if stageLFNs:
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")
            # Propagate a failed staging request instead of reporting success,
            # as the user-job branch below already does
            result = self.__requestStaging(jobState, stageLFNs)
            if not result['OK']:
                return result
            return S_OK()
        else:
            # No staging required
            onlineSites = res['Value']['onlineSites']
            if onlineSites:
                # Set the online site(s) first
                userSites = set(userSites)
                onlineSites &= userSites
                userSites = list(onlineSites) + list(userSites - onlineSites)
            return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites, onlineSites=onlineSites)

    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info", result['Message'])
        return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
        siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    # Removal is done in a second pass so the dict is not mutated while iterated
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
        return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, jobManifest, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
        return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result

    return self.__setJobSite(jobState, stageSites)
def optimizeJob(self, jid, jobState):
    """ 1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary

        :param jid: job identifier (not referenced in this method's body)
        :param jobState: job state object; this method calls its
                         getAttributes, getAttribute and getInputData
        :return: a result structure carrying an 'OK' key: S_OK() / S_ERROR(msg),
                 or whatever the delegated helper (__holdJob, __sendToTQ,
                 __setJobSite, ...) returns
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError, TypeError):
        # TypeError covers a non-numeric, non-string value such as None
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Past the end of the list the last delay keeps being used
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get site requirements
    result = self.__getSitesRequired(jobState)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
        return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.__jobDB.getSiteMask('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            result = self.__jobDB.getUserSitesTuple(userSites)
            if not result['OK']:
                return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
            userSites, bannedSites, invalidSites = result['Value']
            if invalidSites:
                self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
                if not self.ex_getOption('AllowInvalidSites', True):
                    return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
            if bannedSites:
                self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
                if not userSites:
                    return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

            if not userSites:
                return self.__holdJob(jobState, "No requested site(s) are active/valid")
            userSites = list(userSites)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data %s" % (result['Message']))
        return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
        # No input data? Just send to TQ
        return self.__sendToTQ(jobState, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
        self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

        # The owner and owner group are handed to getFilesToStage as proxy identity
        userName = jobState.getAttribute('Owner')
        if not userName['OK']:
            return userName
        userName = userName['Value']

        userGroup = jobState.getAttribute('OwnerGroup')
        if not userGroup['OK']:
            return userGroup
        userGroup = userGroup['Value']

        res = getFilesToStage(inputData,
                              proxyUserName=userName,
                              proxyUserGroup=userGroup)  # pylint: disable=unexpected-keyword-arg

        if not res['OK']:
            return self.__holdJob(jobState, res['Message'])

        stageLFNs = res['Value']['offlineLFNs']
        if stageLFNs:
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")
            # Propagate a failed staging request instead of reporting success,
            # as the user-job branch below already does
            result = self.__requestStaging(jobState, stageLFNs)
            if not result['OK']:
                return result
            return S_OK()
        else:
            return self.__sendToTQ(jobState, userSites, userBannedSites)

    # From now on we know it's a user job with input data

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info", result['Message'])
        return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
        siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    # Removal is done in a second pass so the dict is not mutated while iterated
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(jobState, inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
        return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobState, stageSite, opData)
    if not stageRequest['OK']:
        return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobState, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result

    return self.__setJobSite(jobState, stageSites)
def optimizeJob(self, jid, jobState):
    """ 1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary

        :param jid: job identifier (not referenced in this method's body)
        :param jobState: job state object; this method calls its
                         getAttributes, getManifest, getAttribute and getInputData
        :return: a result structure carrying an 'OK' key: S_OK() / S_ERROR(msg),
                 or whatever the delegated helper (__holdJob, __sendToTQ,
                 __setJobSite, ...) returns
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError, TypeError):
        # TypeError covers a non-numeric, non-string value such as None
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Past the end of the list the last delay keeps being used
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
        return S_ERROR("Could not retrieve job manifest: %s" % result['Message'])
    jobManifest = result['Value']

    # Get site requirements
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
        return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            result = self.siteClient.getUsableSites(userSites)
            if not result['OK']:
                return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
            usableSites = set(result['Value'])
            # Requested sites that are not usable are either banned or invalid
            bannedSites = []
            invalidSites = []
            for site in userSites:
                if site in wmsBannedSites:
                    bannedSites.append(site)
                elif site not in usableSites:
                    invalidSites.append(site)

            if invalidSites:
                self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
                if not self.ex_getOption('AllowInvalidSites', True):
                    return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
            if bannedSites:
                self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
                if not usableSites:
                    return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

            if not usableSites:
                return self.__holdJob(jobState, "No requested site(s) are active/valid")
            userSites = list(usableSites)

    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
        result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
        if not result['OK']:
            return S_ERROR("Unable to get OSCompatibility list")
        allPlatforms = result['Value']
        if jobPlatform not in allPlatforms:
            self.jobLog.error("Platform %s is not supported" % jobPlatform)
            return S_ERROR("Platform %s is not supported" % jobPlatform)

    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites:
        if jobPlatform:
            result = self.__filterByPlatform(jobPlatform, userSites)
            if not result['OK']:
                self.jobLog.error("Failed to filter job sites by platform: %s" % result['Message'])
                return S_ERROR("Failed to filter job sites by platform")
            userSites = result['Value']
            if not userSites:
                # No sites left after filtering -> Invalid platform/sites combination
                self.jobLog.error("No selected sites match platform '%s'" % jobPlatform)
                return S_ERROR("No selected sites match platform '%s'" % jobPlatform)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data %s" % (result['Message']))
        return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
        # No input data? Just send to TQ
        return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
        self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

        res = getFilesToStage(inputData,
                              jobState=jobState,
                              checkOnlyTapeSEs=self.ex_getOption('CheckOnlyTapeSEs', True),
                              jobLog=self.jobLog)

        if not res['OK']:
            return self.__holdJob(jobState, res['Message'])
        if res['Value']['absentLFNs']:
            # Some files do not exist at all... set the job Failed
            # Reverse errors: group the LFNs by failure reason
            reasons = {}
            # items() rather than iteritems(): the latter exists on Python 2 only
            for lfn, reason in res['Value']['absentLFNs'].items():
                reasons.setdefault(reason, []).append(lfn)
            for reason, lfns in reasons.items():
                # Some files are missing in the FC or in SEs, fail the job
                self.jobLog.error(reason, ','.join(lfns))
            error = ','.join(reasons)
            return S_ERROR(error)

        if res['Value']['failedLFNs']:
            return self.__holdJob(jobState, "Couldn't get storage metadata of some files")
        stageLFNs = res['Value']['offlineLFNs']
        if stageLFNs:
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")
            # Propagate a failed staging request instead of reporting success,
            # as the user-job branch below already does
            result = self.__requestStaging(jobState, stageLFNs)
            if not result['OK']:
                return result
            return S_OK()
        else:
            # No staging required
            onlineSites = res['Value']['onlineSites']
            if onlineSites:
                # Set the online site(s) first
                userSites = set(userSites)
                onlineSites &= userSites
                userSites = list(onlineSites) + list(userSites - onlineSites)
            return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites, onlineSites=onlineSites)

    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info", result['Message'])
        return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
        siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    # Removal is done in a second pass so the dict is not mutated while iterated
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
        return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, jobManifest, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
        return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result

    return self.__setJobSite(jobState, stageSites)