def split(self, job):
    logger.debug("split")

    if self.filesPerJob < 1:
        logger.error('filesPerJob must be greater than 0.')
        raise SplittingError('filesPerJob < 1 : %d' % self.filesPerJob)
    elif (self.maxFiles is not None) and self.maxFiles < 1:
        logger.error('maxFiles must be greater than 0.')
        raise SplittingError('maxFiles < 1 : %d' % self.maxFiles)

    subjobs = []

    logger.debug("Creating all_jobs")
    all_jobs = self._splitter(job, job.inputdata)

    logger.info("Constructing subjobs")
    logger.debug("Filling DataSet")
    for dataset in all_jobs:
        logger.debug("Creating Subjobs with dataset of size: %s" % str(len(dataset)))
        #logger.debug("Creating Subjobs with dataset: %s" % str(dataset))
        subjobs.append(self._create_subjob(job, dataset))

    logger.info("Finished Splitting")
    logger.debug("Returning all subjobs")
    return subjobs
def wrapped_execute(command, expected_type):
    """
    A wrapper around execute to protect us from commands which had errors
    Args:
        command (str): This is the command to be executed against DIRAC
        expected_type (type): This is the type of the object which is returned from DIRAC
    """
    try:
        result = execute(command)
        assert isinstance(result, expected_type)
    except AssertionError:
        raise SplittingError("Output from DIRAC expected to be of type: '%s', we got the following: '%s'" % (expected_type, result))
    except GangaDiracError as err:
        raise SplittingError("Error from Dirac: %s" % err)
    return result
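# Hedged usage sketch for wrapped_execute (not part of the original module): it shows how the
# wrapper might be called when asking DIRAC to split input data, mirroring the
# execute('splitInputData(...)') pattern used by DiracSplitter further down, where the result is a
# dict carrying a 'Value' entry. The helper name, the LFN list and files_per_job are hypothetical.
def _example_wrapped_split(lfns, files_per_job):
    # expected_type=dict: splitInputData is expected to return a dict such as {'Value': [[...], [...]]}
    result = wrapped_execute('splitInputData(%s, %d)' % (lfns, files_per_job), dict)
    return result.get('Value', [])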
def simple_split(files_per_job, inputs):
    """Just splits the files in the order they came"""

    def create_subdataset(data_inputs, iter_begin, iter_end):
        dataset = BesDataset()
        dataset.depth = data_inputs.depth
        dataset.files = data_inputs.files[iter_begin:iter_end]
        return dataset

    result = []
    end = 0
    inputs_length = len(inputs.files)

    for i in range(inputs_length // files_per_job):
        start = i * files_per_job
        end = start + files_per_job
        result.append(create_subdataset(inputs, start, end))

    if end < inputs_length:
        result.append(create_subdataset(inputs, end, None))

    # catch file loss
    result_length = 0
    for r in result:
        result_length += len(r.files)
    if result_length != inputs_length:
        raise SplittingError('Data files lost during splitting, please send '
                             'a bug report to the Ganga team.')

    return result
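# A minimal, self-contained sketch of the slicing arithmetic used by simple_split above
# (it re-implements only the index calculation, without BesDataset, to make the chunk sizes
# visible; the helper name is illustrative and not part of the original source).
def _chunk_sizes(n_files, files_per_job):
    sizes = []
    end = 0
    for i in range(n_files // files_per_job):
        start = i * files_per_job
        end = start + files_per_job
        sizes.append(end - start)
    if end < n_files:
        sizes.append(n_files - end)  # the leftover slice [end:None]
    return sizes

# e.g. _chunk_sizes(10, 3) -> [3, 3, 3, 1]; the sizes always sum back to n_files,
# which is exactly what the file-loss check at the end of simple_split verifies.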
def split(self, job):
    if self.evtMaxPerJob > 50000:
        raise SplittingError('evtMaxPerJob is larger than 50000 : %d. Please set a smaller number' % self.evtMaxPerJob)

    self._jobProperties = []
    self._prepare(job)

    if self.seed is None:
        seedStart = self._getSeedStart()
    else:
        seedStart = self.seed

    if self.outputEvtNum:
        f = open(self.outputEvtNum, 'w')
        for jobProperty in self._jobProperties:
            print >>f, '%8d %8d %8d' % (jobProperty['runL'], jobProperty['runH'], jobProperty['eventNum'])
        f.close()

    realTotalNum = 0
    subjobs = []
    rndmSeed = seedStart

    for jobProperty in self._jobProperties:
        realTotalNum += jobProperty['eventNum']

        subjob = create_gaudi_subjob(job, job.inputdata)
        self._createSimJob(subjob, jobProperty, rndmSeed)
        if job.application.recoptsfile:
            self._createRecJob(subjob, jobProperty, rndmSeed)
        if job.application.anaoptsfile:
            self._createAnaJob(subjob, jobProperty)

        subjob.application.extra.output_name = jobProperty['filename']
        subjob.application.extra.metadata['round'] = jobProperty['round']
        subjob.application.extra.metadata['runFrom'] = jobProperty['runFrom']
        subjob.application.extra.metadata['runTo'] = jobProperty['runTo']

        for step in subjob.application.output_step:
            if step in subjob.application.extra.data_type:
                subjob.application.extra.output_files.append(
                    step + '/' + subjob.application.extra.output_name + '.' + subjob.application.extra.data_type[step])

        subjobs.append(subjob)
        rndmSeed += 1

    (runFrom, runTo) = get_runLH(job.application.extra.run_ranges)
    round = get_round_nums(job.application.extra.run_ranges)[0]

    taskInfo = {}
    taskInfo['SplitterType'] = self.__class__.__name__
    taskInfo['SeedStart'] = seedStart
    taskInfo['TotalEventNum'] = self.evtTotal
    taskInfo['EventMax'] = self.evtMaxPerJob
    taskInfo['RealTotalEventNum'] = realTotalNum
    taskInfo['Round'] = round
    taskInfo['RunFrom'] = runFrom
    taskInfo['RunTo'] = runTo
    gDiracTask.updateTaskInfo(taskInfo)

    return subjobs
def badLFNCheck(bad_lfns, allLFNs, LFNdict, ignoremissing, allLFNData):
    """
    Method to re-sort the LFN replica data and check for bad LFNs
    Args:
        bad_lfns (list): This is the list which will contain LFNs which have no replicas
        allLFNs (list): List of all of the LFNs in the inputs which have accessible replicas
        LFNdict (dict): dict of LFN to DiracFiles
        ignoremissing (bool): Check if we have any bad lfns
        allLFNData (dict): All LFN replica data
    """
    # FIXME here to keep the repo settings as they were before we changed the flush count
    original_write_perm = {}

    global LFN_parallel_limit
    global limit_divide_one

    for i in range(int(math.ceil(float(len(allLFNs)) * limit_divide_one))):

        output = allLFNData.get(i)

        if output is None:
            msg = "Error getting Replica information from Dirac: [%s,%s]" % (
                str(i * LFN_parallel_limit), str((i + 1) * LFN_parallel_limit))
            raise SplittingError(msg)

        # Identify files which have Failed to be found by DIRAC
        results = output
        values = results.get('Successful')

        upper_limit = (i + 1) * LFN_parallel_limit
        if upper_limit > len(allLFNs):
            upper_limit = len(allLFNs)

        #logger.debug("Updating LFN Physical Locations: [%s:%s] of %s" % (str(i * LFN_parallel_limit), str(upper_limit), str(len(allLFNs))))

        for this_lfn in values.keys():
            #logger.debug("LFN: %s" % str(this_lfn))
            this_dict = {}
            this_dict[this_lfn] = values.get(this_lfn)

            if this_lfn in LFNdict:
                #logger.debug("Updating RemoteURLs")
                LFNdict[this_lfn]._updateRemoteURLs(this_dict)
                #logger.debug("This_dict: %s" % str(this_dict))
            else:
                logger.error("Error updating remoteURLs for: %s" % str(this_lfn))

            # If we find NO replicas then also mark this as bad!
            if this_dict[this_lfn].keys() == []:
                bad_lfns.append(this_lfn)

    for this_lfn in bad_lfns:
        logger.warning("LFN: %s was either unknown to DIRAC or unavailable, Ganga is ignoring it!" % str(this_lfn))
        if this_lfn in LFNdict:
            del LFNdict[this_lfn]
        if this_lfn in allLFNs:
            allLFNs.remove(this_lfn)
def wrapped_execute(command, expected_type):
    """
    A wrapper around execute to protect us from commands which had errors
    """
    try:
        result = execute(command)
        assert isinstance(result, expected_type)
    except AssertionError:
        raise SplittingError(
            "Output from DIRAC expected to be of type: '%s', we got the following: '%s'" % (expected_type, result))
    return result
def _splitFiles(self, inputs):
    # don't let user use this if they're using the Dirac backend
    job = None
    try:
        job = self.getJobObject()
    except:
        pass
    if job:
        if job.backend.__module__.find('Dirac') > 0:
            msg = 'SplitByFiles should not be used w/ the Dirac backend.' \
                  ' You probably want the DiracSplitter.'
            raise SplittingError(msg)

    return simple_split(self.filesPerJob, inputs)
def _splitFiles(self, inputs):
    from GangaBoss.Lib.Dataset import LogicalFile

    files = []
    for f in inputs.files:
        if isLFN(f):
            files.append(f.name)
            print f.name
        if self.maxFiles > 0 and len(files) >= self.maxFiles:
            break

    cmd = 'result = DiracCommands.splitInputData(%s,%d)' \
          % (files, self.filesPerJob)
    print cmd
    result = Dirac.execAPI(cmd)
    if not result_ok(result):
        logger.error('Error splitting files: %s' % str(result))
        raise SplittingError('Error splitting files.')
    split_files = result.get('Value', [])
    if len(split_files) == 0:
        raise SplittingError('An unknown error occurred.')

    datasets = []

    # check that all files were available on the grid
    big_list = []
    for l in split_files:
        big_list.extend(l)
    diff = set(inputs.getFileNames()).difference(big_list)
    if len(diff) > 0:
        for f in diff:
            logger.warning('Ignored file: %s' % f)
        if not self.ignoremissing:
            raise SplittingError('Some files not found!')

    for l in split_files:
        dataset = BesDataset()
        dataset.depth = inputs.depth
        for file in l:
            dataset.files.append(LogicalFile(file))
        datasets.append(dataset)

    return datasets
def split(self, job):
    subjobs = []
    for arg in self.args:
        j = self.createSubjob(job, ['application'])
        # Add new arguments to subjob
        app = copy.deepcopy(job.application)
        if hasattr(app, 'args'):
            app.args = arg
        elif hasattr(app, 'extraArgs'):
            app.extraArgs = arg
        else:
            raise SplittingError('Application has neither args nor extraArgs in its schema')
        j.application = app
        logger.debug('Arguments for split job are: ' + str(arg))
        subjobs.append(stripProxy(j))
    return subjobs
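# Hedged illustration of the args-based split method above (not taken from the original source):
# assuming the enclosing splitter class exposes an 'args' list, each element becomes the argument
# list of one subjob's deep-copied application, so three entries yield three subjobs. The splitter
# variable and the argument values below are hypothetical placeholders.
#
#   splitter.args = [['--run', '1'], ['--run', '2'], ['--run', '3']]
#   subjobs = splitter.split(job)   # -> 3 subjobs, each with application.args set to one entry
#   assert len(subjobs) == len(splitter.args)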
def lookUpLFNReplicas(inputs, ignoremissing):
    """
    This method launches several worker threads to collect the replica information for all LFNs
    which are given as inputs and stores this in allLFNData
    Args:
        inputs (list): This is a list of input DiracFile which are to be looked up
        ignoremissing (bool): Should LFNs with no accessible replicas be ignored
    Returns:
        allLFNs (list): List of all of the LFNs in the inputs
        LFNdict (dict): dict of LFN to DiracFiles
        bad_lfns (list): List of LFNs which had no accessible replicas
    """
    allLFNData = {}
    # Build a useful dictionary and list
    allLFNs = [_lfn.lfn for _lfn in inputs]
    LFNdict = dict.fromkeys(allLFNs)
    for _lfn in inputs:
        LFNdict[_lfn.lfn] = _lfn

    # Request the replicas for all LFN 'LFN_parallel_limit' at a time to not overload the
    # server and give some feedback as this is going on
    global limit_divide_one
    for i in range(int(math.ceil(float(len(allLFNs)) * limit_divide_one))):
        getQueues()._monitoring_threadpool.add_function(getLFNReplicas, (allLFNs, i, allLFNData))

    while len(allLFNData) != int(math.ceil(float(len(allLFNs)) * limit_divide_one)):
        time.sleep(1.)
        # This can take a while so lets protect any repo locks
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

    bad_lfns = []

    # Sort this information and store it in the relevant Ganga objects
    badLFNCheck(bad_lfns, allLFNs, LFNdict, ignoremissing, allLFNData)

    # Check if we have any bad lfns
    if bad_lfns and ignoremissing is False:
        logger.error("Errors found getting LFNs:\n%s" % str(bad_lfns))
        raise SplittingError("Error trying to split dataset with invalid LFN and ignoremissing = False")

    return allLFNs, LFNdict, bad_lfns
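# A short sketch of the chunk-count arithmetic used in lookUpLFNReplicas and badLFNCheck above.
# It assumes limit_divide_one is defined elsewhere as 1.0 / LFN_parallel_limit (that definition is
# not shown in this extract), so ceil(len(allLFNs) * limit_divide_one) is simply the number of
# LFN_parallel_limit-sized batches the LFN list is queried in. The helper and the default of 500
# are illustrative, not values taken from the source.
import math

def _number_of_chunks(n_lfns, lfn_parallel_limit=500):
    limit_divide_one = 1.0 / float(lfn_parallel_limit)
    return int(math.ceil(float(n_lfns) * limit_divide_one))

# e.g. _number_of_chunks(1200, 500) -> 3: replica lookups cover LFNs [0:500], [500:1000], [1000:1200]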
def _splitter(self, job, inputdata):
    logger.debug("_splitter")

    indata = stripProxy(copy.deepcopy(job.inputdata))

    if not job.inputdata:
        share_path = os.path.join(
            expandfilename(getConfig('Configuration')['gangadir']),
            'shared',
            getConfig('Configuration')['user'],
            job.application.is_prepared.name,
            'inputdata',
            'options_data.pkl')

        if os.path.exists(share_path):
            f = open(share_path, 'r+b')
            indata = pickle.load(f)
            f.close()
        else:
            logger.error('Cannot split if no inputdata given!')
            raise SplittingError('job.inputdata is None and no inputdata found in optsfile')

    self.depth = indata.depth
    self.persistency = indata.persistency
    self.XMLCatalogueSlice = indata.XMLCatalogueSlice

    if stripProxy(job.backend).__module__.find('Dirac') > 0:
        if self.filesPerJob > 100:
            self.filesPerJob = 100  # see above warning
        logger.debug("indata: %s " % str(indata))
        outdata = DiracSplitter(indata, self.filesPerJob, self.maxFiles, self.ignoremissing)
        logger.debug("outdata: %s " % str(outdata))
        return outdata
    else:
        logger.error("This Splitter HAS NOT yet been implemented for all IGangaFile objects")
        raise NotImplementedError
def split(self, job):
    if self.filesPerJob < 1:
        logger.error('filesPerJob must be greater than 0.')
        raise SplittingError('filesPerJob < 1 : %d' % self.filesPerJob)

    subjobs = []

    inputdata = job.inputdata
    if hasattr(job.application, 'extra'):
        inputdata = job.application.extra.inputdata

    inputs = BesDataset()
    inputs.depth = inputdata.depth

    if int(self.maxFiles) == -1:
        inputs.files = inputdata.files[:]
        logger.info("Using all %d input files for splitting" % len(inputs))
    else:
        inputs.files = inputdata.files[:self.maxFiles]
        logger.info("Only using a maximum of %d input files" % int(self.maxFiles))

    datasetlist = self._splitFiles(inputs)
    for dataset in datasetlist:
        subjobs.append(create_gaudi_subjob(job, dataset))

    return subjobs
def OfflineGangaDiracSplitter(_inputs, filesPerJob, maxFiles, ignoremissing, bannedSites=[]):
    """
    Generator that yields datasets for DIRAC split jobs
    Args:
        _inputs (list): This is a list of input DiracFile objects
        filesPerJob (int): Max files per job as defined by the splitter
        maxFiles (int): This is the max number of files per subset (subjob)
        ignoremissing (bool): Should we ignore missing LFNs
        bannedSites (list): List of banned sites of which the SEs will not be used
    Yields:
        dataset (list): A list of LFNs for each subset (subjob)
    """
    if maxFiles is not None and maxFiles > 0:
        inputs = _inputs[:maxFiles]
    else:
        inputs = _inputs

    # First FIND ALL LFN REPLICAS AND SE<->SITE MAPPINGS AND STORE THIS IN MEMORY
    # THIS IS DONE IN PARALLEL TO AVOID OVERLOADING DIRAC WITH THOUSANDS OF
    # REQUESTS AT ONCE ON ONE CONNECTION

    wanted_common_site = configDirac['OfflineSplitterMaxCommonSites']
    uniqueSE = configDirac['OfflineSplitterUniqueSE']

    if inputs is None:
        raise SplittingError("Cannot Split Job as the inputdata appears to be None!")

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError("Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata")

    file_replicas = {}

    logger.info("Requesting LFN replica info")

    # Perform a lookup of where LFNs are all stored
    allLFNs, LFNdict, bad_lfns = lookUpLFNReplicas(inputs, ignoremissing)

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    # Bad LFN should have been removed by this point however
    all_lfns = [LFNdict[this_lfn].locations for this_lfn in LFNdict if this_lfn not in bad_lfns]

    logger.info("Got all good replicas")

    for this_input in inputs:
        if this_input.lfn not in bad_lfns:
            file_replicas[this_input.lfn] = this_input.locations

    logger.info("Found all replicas")

    site_to_SE_mapping = {}
    SE_to_site_mapping = {}

    allSubSets = []

    # Now lets generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    site_dict = calculateSiteSEMapping(file_replicas, uniqueSE, site_to_SE_mapping, SE_to_site_mapping,
                                       bannedSites, ignoremissing)

    allChosenSets = {}
    # Now select a set of sites to use as a seed for constructing a subset of LFNs
    for lfn in site_dict.keys():
        allChosenSets[lfn] = generate_site_selection(site_dict[lfn], wanted_common_site, uniqueSE,
                                                     site_to_SE_mapping, SE_to_site_mapping)

    logger.debug("Found all SE in use")

    # BELOW IS WHERE THE ACTUAL SPLITTING IS DONE
    logger.info("Calculating best data subsets")

    allSubSets = performSplitting(site_dict, filesPerJob, allChosenSets, wanted_common_site,
                                  uniqueSE, site_to_SE_mapping, SE_to_site_mapping)

    avg = 0.
    for this_set in allSubSets:
        avg += float(len(this_set))
    avg /= float(len(allSubSets))
    logger.info("Average Subset size is: %s" % str(avg))

    # FINISHED SPLITTING CHECK!!!
    check_count = 0
    for i in allSubSets:
        check_count = check_count + len(i)

    if check_count != len(inputs) - len(bad_lfns):
        logger.error("SERIOUS SPLITTING ERROR!!!!!")
        raise SplitterError("Files Missing after Splitting!")
    else:
        logger.info("File count checked! Ready to Submit")

    # RETURN THE RESULT
    logger.info("Created %s subsets" % len(allSubSets))

    for dataset in allSubSets:
        yield dataset
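# Hedged usage sketch for OfflineGangaDiracSplitter (not part of the original source): the splitter
# is a generator, so calling code is expected to iterate over it and build one subjob per yielded
# subset. 'my_dirac_files' and the parameter values below are hypothetical placeholders, and each
# yielded dataset is one subjob's worth of input files grouped by a common set of storage sites.
#
#   subsets = list(OfflineGangaDiracSplitter(my_dirac_files, filesPerJob=20,
#                                            maxFiles=None, ignoremissing=False))
#   for subset in subsets:
#       print(len(subset))   # no subset exceeds filesPerJob, and the total matches the good inputs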
def OfflineGangaDiracSplitter(_inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for DIRAC split jobs
    """
    if maxFiles is not None and maxFiles > 0:
        inputs = _inputs[:maxFiles]
    else:
        inputs = _inputs

    # First FIND ALL LFN REPLICAS AND SE<->SITE MAPPINGS AND STORE THIS IN MEMORY
    # THIS IS DONE IN PARALLEL TO AVOID OVERLOADING DIRAC WITH THOUSANDS OF
    # REQUESTS AT ONCE ON ONE CONNECTION

    wanted_common_site = configDirac['OfflineSplitterMaxCommonSites']
    iterative_limit = configDirac['OfflineSplitterLimit']
    good_fraction = configDirac['OfflineSplitterFraction']
    uniqueSE = configDirac['OfflineSplitterUniqueSE']

    split_files = []

    if inputs is None:
        raise SplittingError("Cannot Split Job as the inputdata appears to be None!")

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError("Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata")

    file_replicas = {}

    logger.info("Requesting LFN replica info")

    allLFNData = {}

    # Perform a lookup of where LFNs are all stored
    allLFNs, LFNdict = lookUpLFNReplicas(inputs, allLFNData)

    for _lfn in allLFNData:
        if allLFNData[_lfn] is None:
            logger.error("Error in Getting LFN Replica information, aborting split")
            raise SplittingError("Error in Getting LFN Replica information, aborting split")

    bad_lfns = []

    # Sort this information and store it in the relevant Ganga objects
    errors = sortLFNreplicas(bad_lfns, allLFNs, LFNdict, ignoremissing, allLFNData, inputs)

    if len(bad_lfns) != 0:
        if ignoremissing is False:
            logger.error("Errors found getting LFNs:\n%s" % str(errors))
            raise SplittingError("Error trying to split dataset with invalid LFN and ignoremissing = False")

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    # Bad LFN should have been removed by this point however
    all_lfns = [LFNdict[this_lfn].locations for this_lfn in LFNdict if this_lfn not in bad_lfns]

    logger.info("Got replicas")

    for this_input in inputs:
        if this_input.lfn not in bad_lfns:
            file_replicas[this_input.lfn] = this_input.locations

    logger.info("found all replicas")

    logger.info("Calculating site<->SE Mapping")

    site_to_SE_mapping = {}
    SE_to_site_mapping = {}

    # Now lets generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    site_dict, allSubSets, allChosenSets = calculateSiteSEMapping(
        file_replicas, wanted_common_site, uniqueSE, site_to_SE_mapping, SE_to_site_mapping)

    logger.debug("Found all SE in use")

    # BELOW IS WHERE THE ACTUAL SPLITTING IS DONE
    logger.info("Calculating best data subsets")

    iterations = 0
    # Loop over all LFNs
    while len(site_dict.keys()) > 0:

        # LFN left to be used
        # NB: Can't modify this list and iterate over it directly in python
        LFN_instances = site_dict.keys()
        # Already used LFN
        chosen_lfns = set()

        for iterating_LFN in LFN_instances:

            # If this has previously been selected lets ignore it and move on
            if iterating_LFN in chosen_lfns:
                continue

            # Use this seed to try and construct a subset
            req_sitez = allChosenSets[iterating_LFN]
            _this_subset = []

            #logger.debug("find common LFN for: " + str(allChosenSets[iterating_LFN]))

            # Construct subset
            # Starting with i, populate subset with LFNs which have an
            # overlap of at least 2 SE
            for this_LFN in LFN_instances:
                if this_LFN in chosen_lfns:
                    continue
                if req_sitez.issubset(site_dict[this_LFN]):
                    if len(_this_subset) >= filesPerJob:
                        break
                    _this_subset.append(this_LFN)

            limit = int(math.floor(float(filesPerJob) * good_fraction))
            #logger.debug("Size limit: %s" % str(limit))

            # If subset is too small throw it away
            if len(_this_subset) < limit:
                #logger.debug("%s < %s" % (str(len(_this_subset)), str(limit)))
                allChosenSets[iterating_LFN] = generate_site_selection(
                    site_dict[iterating_LFN], wanted_common_site, uniqueSE, site_to_SE_mapping, SE_to_site_mapping)
                continue
            else:
                logger.debug("found common LFN for: " + str(allChosenSets[iterating_LFN]))
                logger.debug("%s > %s" % (str(len(_this_subset)), str(limit)))
                # else Dataset was large enough to be considered useful
                logger.debug("Generating Dataset of size: %s" % str(len(_this_subset)))
                # Construct DiracFile here as we want to keep the above combination
                allSubSets.append([DiracFile(lfn=str(this_LFN)) for this_LFN in _this_subset])

                for lfn in _this_subset:
                    site_dict.pop(lfn)
                    allChosenSets.pop(lfn)
                    chosen_lfns.add(lfn)

        # Lets keep track of how many times we've tried this
        iterations = iterations + 1

        # Can take a while so lets not let threads become un-locked
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

        # If on final run, will exit loop after this so lets try and cleanup
        if iterations >= iterative_limit:

            if good_fraction < 0.5:
                good_fraction = good_fraction * 0.75
                iterations = 0
            elif wanted_common_site > 1:
                logger.debug("Reducing Common Site Size")
                wanted_common_site = wanted_common_site - 1
                iterations = 0
                good_fraction = 0.75
            else:
                good_fraction = good_fraction * 0.75

            logger.debug("good_fraction: %s" % str(good_fraction))

    split_files = allSubSets

    avg = float()
    for this_set in allSubSets:
        avg += float(len(this_set))
    avg /= float(len(allSubSets))
    logger.info("Average Subset size is: %s" % str(avg))

    # FINISHED SPLITTING CHECK!!!
    check_count = 0
    for i in split_files:
        check_count = check_count + len(i)

    if check_count != len(inputs) - len(bad_lfns):
        logger.error("SERIOUS SPLITTING ERROR!!!!!")
        raise SplitterError("Files Missing after Splitting!")
    else:
        logger.info("File count checked! Ready to Submit")

    # RETURN THE RESULT
    logger.info("Created %s subsets" % str(len(split_files)))

    #logger.info("Split Files: %s" % str(split_files))

    for dataset in split_files:
        yield dataset
def _splitter(self, job, inputdata):
    if (inputdata is None) or (len(inputdata.files) == 0):
        logger.error('Cannot split if no inputdata given!')
        raise SplittingError('inputdata is None or empty')

    logger.debug("Found %s files in inputdata" % str(len(inputdata.files)))

    return DatasetSplitter(inputdata, self.filesPerJob, self.maxFiles)
def DiracSplitter(inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for DIRAC split jobs
    """
    #logger.debug("DiracSplitter")
    #logger.debug("inputs: %s" % str(inputs))
    split_files = []
    i = inputs.__class__()

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError("Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata")

    all_files = igroup(inputs.files[:maxFiles], getConfig('DIRAC')['splitFilesChunks'], leftovers=True)

    #logger.debug("Looping over all_files")
    #logger.debug("%s" % str(all_files))
    for files in all_files:
        i.files = files
        LFNsToSplit = i.getLFNs()
        if len(LFNsToSplit) > 1:
            result = execute('splitInputData(%s, %d)' % (i.getLFNs(), filesPerJob))
            if not result_ok(result):
                logger.error('DIRAC:: Error splitting files: %s' % str(result))
                raise SplittingError('Error splitting files.')
            split_files += result.get('Value', [])
        else:
            split_files = [LFNsToSplit]

    if len(split_files) == 0:
        raise SplittingError('An unknown error occurred.')

    # FIXME
    # check that all files were available on the grid
    big_list = []
    for l in split_files:
        big_list.extend(l)
    diff = set(inputs.getFileNames()[:maxFiles]).difference(big_list)
    if len(diff) > 0:
        for f in diff:
            logger.warning('Ignored file: %s' % f)
        if not ignoremissing:
            raise SplittingError('Some files not found!')

    ###
    logger.debug("Split Files: %s" % str(split_files))

    for _dataset in split_files:
        dataset = []
        for _lfn in _dataset:
            dataset.append(DiracFile(lfn=_lfn))
        yield dataset
def GangaDiracSplitter(inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for DIRAC split jobs
    """
    split_files = []
    i = inputs.__class__()

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError(
            "Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata")

    file_replicas = {}

    from Ganga.GPI import queues
    for i in inputs:
        #logging.debug("getting metadata: %s" % str(i.lfn))
        queues.add(i.getReplicas)

    logger.info("Requesting LFN replica info")

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    all_lfns = [i.locations for i in inputs]
    while [] in all_lfns:
        import time
        time.sleep(0.5)
        all_lfns = [i.locations for i in inputs]

    logger.info("Got replicas")

    for i in inputs:
        file_replicas[i.lfn] = i.locations
        #logger.info("%s" % str(i.accessURL()))

    logger.debug("found all replicas")

    super_dict = dict()
    for lfn, repz in file_replicas.iteritems():
        sitez = set([])
        for i in repz:
            # print i
            sitez.add(i)
        super_dict[lfn] = sitez

    allSubSets = []
    allChosenSets = {}

    logger.info("Determining overlap")

    import random
    for i in super_dict.keys():
        # Randomly select 2 SE as the starting point for splitting jobs
        if len(super_dict[i]) > 2:
            req_sitez = set([])
            chosen = random.sample(super_dict[i], 2)
            for s in chosen:
                req_sitez.add(s)
        # Keep the 2 or fewer SE as the SE of choice
        else:
            req_sitez = set([])
            for s in super_dict[i]:
                req_sitez.add(s)

        allChosenSets[i] = req_sitez

    logger.debug("Found all SE in use")

    Tier1Sites = set([])

    for i in super_dict.keys():

        req_sitez = allChosenSets[i]
        _this_subset = []

        # Starting with i, populate subset with LFNs which have an
        # overlap of at least 2 SE
        for k in super_dict.keys():
            if req_sitez.issubset(super_dict[k]):
                if len(_this_subset) >= filesPerJob:
                    break
                _this_subset.append(str(k))
                super_dict.pop(k)

        if len(_this_subset) > 0:
            allSubSets.append(_this_subset)

    split_files = allSubSets

    logger.info("Created %s subsets" % str(len(split_files)))
    #logger.info("Split Files: %s" % str(split_files))

    for dataset in split_files:
        yield dataset