Example #1
    def split(self, job):
        logger.debug("split")
        if self.filesPerJob < 1:
            logger.error('filesPerJob must be greater than 0.')
            raise SplittingError('filesPerJob < 1 : %d' % self.filesPerJob)
        elif (self.maxFiles is not None) and self.maxFiles < 1:
            logger.error('maxFiles must be greater than 0.')
            raise SplittingError('maxFiles < 1 : %d' % self.maxFiles)

        subjobs = []

        logger.debug("Creating all_jobs")
        all_jobs = self._splitter(job, job.inputdata)

        logger.info("Constructing subjobs")
        logger.debug("Filling DataSet")
        for dataset in all_jobs:
            logger.debug("Creating Subjobs with dataset of size: %s" %
                         str(len(dataset)))
            #logger.debug( "Creating Subjobs with dataset: %s" % str(dataset) )
            subjobs.append(self._create_subjob(job, dataset))

        logger.info("Finished Splitting")
        logger.debug("Returning all subjobs")
        return subjobs
def wrapped_execute(command, expected_type):
    """
    A wrapper around execute to protect us from commands which had errors
    Args:
        command (str): This is the command to be executed against DIRAC
        expected_type (type): This is the type of the object which is returned from DIRAC
    """
    try:
        result = execute(command)
        assert isinstance(result, expected_type)
    except AssertionError:
        raise SplittingError("Output from DIRAC expected to be of type: '%s', we got the following: '%s'" % (expected_type, result))
    except GangaDiracError as err:
        raise SplittingError("Error from Dirac: %s" % err)
    return result
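
The same guard pattern can be sketched stand-alone: run a command, check the type of the result, and convert anything unexpected into a single splitter-level error. The stub names below (execute_stub, run_checked) are illustrative only and are not part of Ganga or DIRAC.

def execute_stub(command):
    # Stand-in for Ganga's execute(): pretend DIRAC answered with a dict.
    return {'OK': True, 'Value': []}

def run_checked(command, expected_type):
    try:
        result = execute_stub(command)
        if not isinstance(result, expected_type):
            raise TypeError("expected %s, got %r" % (expected_type, result))
    except Exception as err:
        # Collapse everything into one error type the caller can handle,
        # mirroring how wrapped_execute re-raises as SplittingError.
        raise RuntimeError("Error talking to DIRAC: %s" % err)
    return result

print(run_checked('getReplicas([])', dict))
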
Example #3
def simple_split(files_per_job, inputs):
    """Just splits the files in the order they came"""

    def create_subdataset(data_inputs,iter_begin,iter_end):
        dataset = BesDataset()
        dataset.depth = data_inputs.depth
        dataset.files = data_inputs.files[iter_begin:iter_end]
        return dataset

    result = []
    end = 0
    inputs_length = len(inputs.files)

    for i in range(inputs_length // files_per_job):
        start = i * files_per_job
        end = start + files_per_job
        result.append(create_subdataset(inputs,start,end))

    if end < (inputs_length):
        result.append(create_subdataset(inputs,end,None))

    #catch file loss
    result_length = 0
    for r in result: result_length += len(r.files)
    if result_length != inputs_length:
        raise SplittingError('Data files lost during splitting, please send '\
                             'a bug report to the Ganga team.')

    return result
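
The docstring says the files are split in the order they arrive; a minimal sketch of that chunk-plus-remainder behaviour with plain lists (no BesDataset) looks like this. The helper name chunk_in_order is illustrative and not part of Ganga.

def chunk_in_order(files, files_per_job):
    """Split files into consecutive chunks of at most files_per_job, keeping order."""
    chunks = [files[i:i + files_per_job]
              for i in range(0, len(files), files_per_job)]
    # Mirror the "catch file loss" check above: nothing may be dropped.
    assert sum(len(c) for c in chunks) == len(files)
    return chunks

print(chunk_in_order(['f1', 'f2', 'f3', 'f4', 'f5'], 2))
# [['f1', 'f2'], ['f3', 'f4'], ['f5']]
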
Example #4
    def split(self,job):
        if self.evtMaxPerJob > 50000:
            raise SplittingError('evtMaxPerJob is larger than 50000 : %d. Please set a smaller number' % self.evtMaxPerJob)

        self._jobProperties = []

        self._prepare(job)

        if self.seed is None:
            seedStart = self._getSeedStart()
        else:
            seedStart = self.seed

        if self.outputEvtNum:
            f = open(self.outputEvtNum, 'w')
            for jobProperty in self._jobProperties:
                print >>f, '%8d %8d %8d'%(jobProperty['runL'], jobProperty['runH'], jobProperty['eventNum'])
            f.close()

        realTotalNum = 0
        subjobs=[]
        rndmSeed = seedStart
        for jobProperty in self._jobProperties:
            realTotalNum += jobProperty['eventNum']

            subjob = create_gaudi_subjob(job, job.inputdata)

            self._createSimJob(subjob, jobProperty, rndmSeed)
            if job.application.recoptsfile:
                self._createRecJob(subjob, jobProperty, rndmSeed)
                if job.application.anaoptsfile:
                    self._createAnaJob(subjob, jobProperty)

            subjob.application.extra.output_name          = jobProperty['filename']
            subjob.application.extra.metadata['round']    = jobProperty['round']
            subjob.application.extra.metadata['runFrom']  = jobProperty['runFrom']
            subjob.application.extra.metadata['runTo']    = jobProperty['runTo']

            for step in subjob.application.output_step:
                if step in subjob.application.extra.data_type:
                    subjob.application.extra.output_files.append(step + '/' + subjob.application.extra.output_name + '.' + subjob.application.extra.data_type[step])

            subjobs.append(subjob)
            rndmSeed += 1

        (runFrom, runTo) = get_runLH(job.application.extra.run_ranges)
        round = get_round_nums(job.application.extra.run_ranges)[0]
        taskInfo = {}
        taskInfo['SplitterType'] = self.__class__.__name__
        taskInfo['SeedStart'] = seedStart
        taskInfo['TotalEventNum'] = self.evtTotal
        taskInfo['EventMax'] = self.evtMaxPerJob
        taskInfo['RealTotalEventNum'] = realTotalNum
        taskInfo['Round'] = round
        taskInfo['RunFrom'] = runFrom
        taskInfo['RunTo'] = runTo
        gDiracTask.updateTaskInfo(taskInfo)

        return subjobs
def badLFNCheck(bad_lfns, allLFNs, LFNdict, ignoremissing, allLFNData):
    """
    Method to re-sort the LFN replica data and check for bad LFNs

    Args:
        bad_lfns (list): This is the list which will contain LFNs which have no replicas
        allLFNs (list): List of all of the LFNs in the inputs which have accessible replicas
        LFNdict (dict): dict of LFN to DiracFiles
        ignoremissing (bool): Should we ignore LFNs which have no replicas
        allLFNData (dict): All LFN replica data
    """

    # FIXME here to keep the repo settings as they were before we changed the
    # flush count
    original_write_perm = {}

    global LFN_parallel_limit
    global limit_divide_one

    for i in range(int(math.ceil(float(len(allLFNs)) * limit_divide_one))):
        output = allLFNData.get(i)

        if output is None:
            msg = "Error getting Replica information from Dirac: [%s,%s]" % ( str(i * LFN_parallel_limit), str((i + 1) * LFN_parallel_limit))
            raise SplittingError(msg)

        # Identify files which have Failed to be found by DIRAC
        results = output
        values = results.get('Successful')

        upper_limit = (i+1)*LFN_parallel_limit
        if upper_limit > len(allLFNs):
            upper_limit = len(allLFNs)

        #logger.debug("Updating LFN Physical Locations: [%s:%s] of %s" % (str(i * LFN_parallel_limit), str(upper_limit), str(len(allLFNs))))

        for this_lfn in values.keys():
            #logger.debug("LFN: %s" % str(this_lfn))
            this_dict = {}
            this_dict[this_lfn] = values.get(this_lfn)

            if this_lfn in LFNdict:
                #logger.debug("Updating RemoteURLs")
                LFNdict[this_lfn]._updateRemoteURLs(this_dict)
                #logger.debug("This_dict: %s" % str(this_dict))
            else:
                logger.error("Error updating remoteURLs for: %s" % str(this_lfn))

            # If we find NO replicas then also mark this as bad!
            if not this_dict[this_lfn]:
                bad_lfns.append(this_lfn)

        for this_lfn in bad_lfns:
            logger.warning("LFN: %s was either unknown to DIRAC or unavailable, Ganga is ignoring it!" % str(this_lfn))
            if this_lfn in LFNdict:
                del LFNdict[this_lfn]
            if this_lfn in allLFNs:
                allLFNs.remove(this_lfn)
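
The loop above walks the LFN list in windows of LFN_parallel_limit entries, clamping the last window to the list length. Assuming limit_divide_one is 1.0 / LFN_parallel_limit (the snippet does not show this, but the names suggest it), the window arithmetic can be sketched on its own; the value 250 below is arbitrary.

import math

LFN_parallel_limit = 250
limit_divide_one = 1.0 / LFN_parallel_limit

def batch_windows(n_lfns):
    """Yield (start, end) index windows of at most LFN_parallel_limit LFNs."""
    n_batches = int(math.ceil(float(n_lfns) * limit_divide_one))
    for i in range(n_batches):
        yield i * LFN_parallel_limit, min((i + 1) * LFN_parallel_limit, n_lfns)

print(list(batch_windows(600)))
# [(0, 250), (250, 500), (500, 600)]
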
Example #6
def wrapped_execute(command, expected_type):
    """
    A wrapper around execute to protect us from commands which had errors
    """
    try:
        result = execute(command)
        assert isinstance(result, expected_type)
    except AssertionError:
        raise SplittingError(
            "Output from DIRAC expected to be of type: '%s', we got the following: '%s'"
            % (expected_type, result))
    return result
Example #7
    def _splitFiles(self,inputs):
        # don't let user use this if they're using the Dirac backend
        job = None
        try:
            job = self.getJobObject()
        except:
            pass
        if job:
            if job.backend.__module__.find('Dirac') > 0:
                msg = 'SplitByFiles should not be used w/ the Dirac backend.'\
                      ' You probably want the DiracSplitter.'
                raise SplittingError(msg)

        return simple_split(self.filesPerJob,inputs)
Example #8
    def _splitFiles(self, inputs):
        from GangaBoss.Lib.Dataset import LogicalFile
        files = []
        for f in inputs.files:
            if isLFN(f):
                files.append(f.name)
                print f.name
            if self.maxFiles > 0 and len(files) >= self.maxFiles: break
        cmd = 'result = DiracCommands.splitInputData(%s,%d)' \
              % (files,self.filesPerJob)
        print cmd
        result = Dirac.execAPI(cmd)
        if not result_ok(result):
            logger.error('Error splitting files: %s' % str(result))
            raise SplittingError('Error splitting files.')
        split_files = result.get('Value', [])
        if len(split_files) == 0:
            raise SplittingError('An unknown error occurred.')
        datasets = []
        # check that all files were available on the grid
        big_list = []
        for l in split_files:
            big_list.extend(l)
        diff = set(inputs.getFileNames()).difference(big_list)
        if len(diff) > 0:
            for f in diff:
                logger.warning('Ignored file: %s' % f)
            if not self.ignoremissing:
                raise SplittingError('Some files not found!')

        for l in split_files:
            dataset = BesDataset()
            dataset.depth = inputs.depth
            for file in l:
                dataset.files.append(LogicalFile(file))
            datasets.append(dataset)
        return datasets
Example #9
    def split(self, job):

        subjobs = []

        for arg in self.args:
            j = self.createSubjob(job,['application'])
            # Add new arguments to subjob
            app = copy.deepcopy(job.application)
            if hasattr(app, 'args'):
                app.args = arg
            elif hasattr(app, 'extraArgs'):
                app.extraArgs = arg
            else:
                raise SplittingError('Application has neither args nor extraArgs in its schema')

            j.application = app
            logger.debug('Arguments for split job are: ' + str(arg))
            subjobs.append(stripProxy(j))

        return subjobs
def lookUpLFNReplicas(inputs, ignoremissing):
    """
    This method launches several worker threads to collect the replica information for all LFNs which are given as inputs and stores this in allLFNData
    Args:
        inputs (list): This is a list of input DiracFiles whose replicas are to be looked up
        ignoremissing (bool): Should we ignore LFNs which have no replicas
    Returns:
        allLFNs (list): List of all of the LFNs in the inputs
        LFNdict (dict): dict of LFN to DiracFiles
        bad_lfns (list): List of LFNs which have no accessible replicas
    """
    allLFNData = {}
    # Build a useful dictionary and list
    allLFNs = [_lfn.lfn for _lfn in inputs]
    LFNdict = dict.fromkeys(allLFNs)
    for _lfn in inputs:
        LFNdict[_lfn.lfn] = _lfn

    # Request the replicas for all LFN 'LFN_parallel_limit' at a time to not overload the
    # server and give some feedback as this is going on
    global limit_divide_one
    for i in range(int(math.ceil(float(len(allLFNs)) * limit_divide_one))):

        getQueues()._monitoring_threadpool.add_function(getLFNReplicas, (allLFNs, i, allLFNData))

    while len(allLFNData) != int(math.ceil(float(len(allLFNs)) * limit_divide_one)):
        time.sleep(1.)
        # This can take a while so lets protect any repo locks
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

    bad_lfns = []

    # Sort this information and store it in the relevant Ganga objects
    badLFNCheck(bad_lfns, allLFNs, LFNdict, ignoremissing, allLFNData)

    # Check if we have any bad lfns
    if bad_lfns and ignoremissing is False:
        logger.error("Errors found getting LFNs:\n%s" % str(bad_lfns))
        raise SplittingError("Error trying to split dataset with invalid LFN and ignoremissing = False")

    return allLFNs, LFNdict, bad_lfns
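
The function fans the replica lookups out over worker threads and then polls until every batch has reported back. A stand-alone sketch of that fan-out-and-wait pattern, using the Python 3 standard library's ThreadPoolExecutor instead of Ganga's monitoring thread pool (the names below are illustrative only):

import math
from concurrent.futures import ThreadPoolExecutor

def fetch_in_batches(all_lfns, batch_size, fetch_batch):
    """Run fetch_batch(lfns) concurrently for each batch and collect the results by index."""
    n_batches = int(math.ceil(len(all_lfns) / float(batch_size)))
    results = {}
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = {i: pool.submit(fetch_batch, all_lfns[i * batch_size:(i + 1) * batch_size])
                   for i in range(n_batches)}
        for i, future in futures.items():
            results[i] = future.result()  # blocks until that batch is done
    return results

# Dummy fetcher that just echoes the batch it was given.
print(fetch_in_batches(['lfn%d' % n for n in range(5)], 2, lambda batch: batch))
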
Example #11
    def _splitter(self, job, inputdata):

        logger.debug("_splitter")

        indata = stripProxy(copy.deepcopy(job.inputdata))

        if not job.inputdata:
            share_path = os.path.join(
                expandfilename(getConfig('Configuration')['gangadir']),
                'shared',
                getConfig('Configuration')['user'],
                job.application.is_prepared.name, 'inputdata',
                'options_data.pkl')
            if os.path.exists(share_path):
                f = open(share_path, 'r+b')
                indata = pickle.load(f)
                f.close()
            else:
                logger.error('Cannot split if no inputdata given!')
                raise SplittingError(
                    'job.inputdata is None and no inputdata found in optsfile')

        self.depth = indata.depth
        self.persistency = indata.persistency
        self.XMLCatalogueSlice = indata.XMLCatalogueSlice

        if stripProxy(job.backend).__module__.find('Dirac') > 0:
            if self.filesPerJob > 100:
                self.filesPerJob = 100  # see above warning
            logger.debug("indata: %s " % str(indata))
            outdata = DiracSplitter(indata, self.filesPerJob, self.maxFiles,
                                    self.ignoremissing)
            logger.debug("outdata: %s " % str(outdata))
            return outdata
        else:
            logger.error(
                "This Splitter HAS NOT, yet been implemented for all IGangaFile objects"
            )
            raise NotImplementedError
Example #12
    def split(self,job):
        if self.filesPerJob < 1:
            logger.error('filesPerJob must be greater than 0.')
            raise SplittingError('filesPerJob < 1 : %d' % self.filesPerJob)

        subjobs=[]
        inputdata = job.inputdata
        if hasattr(job.application,'extra'):
            inputdata = job.application.extra.inputdata
        inputs = BesDataset()
        inputs.depth = inputdata.depth
        if int(self.maxFiles) == -1:
            inputs.files = inputdata.files[:]
            logger.info("Using all %d input files for splitting" % len(inputs))
        else:
            inputs.files = inputdata.files[:self.maxFiles]
            logger.info("Only using a maximum of %d inputfiles"
                        % int(self.maxFiles))

        datasetlist = self._splitFiles(inputs)
        for dataset in datasetlist:
            subjobs.append(create_gaudi_subjob(job,dataset))

        return subjobs
def OfflineGangaDiracSplitter(_inputs, filesPerJob, maxFiles, ignoremissing, bannedSites=[]):
    """
    Generator that yields datasets for dirac split jobs

    Args:
        _inputs (list): This is a list of input DiracFile objects
        filesPerJob (int): Maximum number of files per subset (subjob)
        maxFiles (int): Maximum total number of files taken from the input dataset
        ignoremissing (bool): Should we ignore missing LFNs
        bannedSites (list): List of banned sites whose SEs will not be used

    Yields:
        dataset (list): A list of LFNs for each subset(subjob)
    """

    if maxFiles is not None and maxFiles > 0:
        inputs = _inputs[:maxFiles]
    else:
        inputs = _inputs

    # First FIND ALL LFN REPLICAS AND SE<->SITE MAPPINGS AND STORE THIS IN MEMORY
    # THIS IS DONE IN PARALLEL TO AVOID OVERLOADING DIRAC WITH THOUSANDS OF
    # REQUESTS AT ONCE ON ONE CONNECTION

    wanted_common_site = configDirac['OfflineSplitterMaxCommonSites']
    uniqueSE = configDirac['OfflineSplitterUniqueSE']

    if inputs is None:
        raise SplittingError("Cannot Split Job as the inputdata appears to be None!")

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError("Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata")

    file_replicas = {}

    logger.info("Requesting LFN replica info")

    # Perform a lookup of where LFNs are all stored
    allLFNs, LFNdict, bad_lfns = lookUpLFNReplicas(inputs, ignoremissing)

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    # Bad LFN should have been removed by this point however
    all_lfns = [LFNdict[this_lfn].locations for this_lfn in LFNdict if this_lfn not in bad_lfns]

    logger.info("Got all good replicas")

    for this_input in inputs:
        if this_input.lfn not in bad_lfns:
            file_replicas[this_input.lfn] = this_input.locations

    logger.info("found all replicas")

    site_to_SE_mapping = {}
    SE_to_site_mapping = {}

    allSubSets = []

    # Now lets generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    site_dict = calculateSiteSEMapping(file_replicas, uniqueSE, site_to_SE_mapping, SE_to_site_mapping, bannedSites, ignoremissing)


    allChosenSets = {}
    # Now select a set of site to use as a seed for constructing a subset of
    # LFN
    for lfn in site_dict.keys():
        allChosenSets[lfn] = generate_site_selection(site_dict[lfn], wanted_common_site, uniqueSE, site_to_SE_mapping, SE_to_site_mapping)

    logger.debug("Found all SE in use")


    # BELOW IS WHERE THE ACTUAL SPLITTING IS DONE

    logger.info("Calculating best data subsets")

    allSubSets = performSplitting(site_dict, filesPerJob, allChosenSets, wanted_common_site, uniqueSE, site_to_SE_mapping, SE_to_site_mapping)

    avg = 0.
    for this_set in allSubSets:
        avg += float(len(this_set))
    avg /= float(len(allSubSets))

    logger.info("Average Subset size is: %s" % (str(avg)))

    # FINISHED SPLITTING CHECK!!!
    check_count = 0
    for i in allSubSets:
        check_count = check_count + len(i)

    if check_count != len(inputs) - len(bad_lfns):
        logger.error("SERIOUS SPLITTING ERROR!!!!!")
        raise SplitterError("Files Missing after Splitting!")
    else:
        logger.info("File count checked! Ready to Submit")

    # RETURN THE RESULT

    logger.info("Created %s subsets" % len(allSubSets))

    for dataset in allSubSets:
        yield dataset
Example #14
def OfflineGangaDiracSplitter(_inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for dirac split jobs
    """

    if maxFiles is not None and maxFiles > 0:
        inputs = _inputs[:maxFiles]
    else:
        inputs = _inputs

    # First FIND ALL LFN REPLICAS AND SE<->SITE MAPPINGS AND STORE THIS IN MEMORY
    # THIS IS DONE IN PARALLEL TO AVOID OVERLOADING DIRAC WITH THOUSANDS OF
    # REQUESTS AT ONCE ON ONE CONNECTION

    wanted_common_site = configDirac['OfflineSplitterMaxCommonSites']
    iterative_limit = configDirac['OfflineSplitterLimit']
    good_fraction = configDirac['OfflineSplitterFraction']
    uniqueSE = configDirac['OfflineSplitterUniqueSE']

    split_files = []

    if inputs is None:
        raise SplittingError(
            "Cannot Split Job as the inputdata appears to be None!")

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError(
            "Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata"
        )

    file_replicas = {}

    logger.info("Requesting LFN replica info")

    allLFNData = {}

    # Perform a lookup of where LFNs are all stored
    allLFNs, LFNdict = lookUpLFNReplicas(inputs, allLFNData)

    for _lfn in allLFNData:
        if allLFNData[_lfn] is None:
            logger.error(
                "Error in Getting LFN Replica information, aborting split")
            raise SplittingError(
                "Error in Getting LFN Replica information, aborting split")

    bad_lfns = []

    # Sort this information and store it in the relevant Ganga objects
    errors = sortLFNreplicas(bad_lfns, allLFNs, LFNdict, ignoremissing,
                             allLFNData, inputs)

    if len(bad_lfns) != 0:
        if ignoremissing is False:
            logger.error("Errors found getting LFNs:\n%s" % str(errors))
            raise SplittingError(
                "Error trying to split dataset with invalid LFN and ignoremissing = False"
            )

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    # Bad LFN should have been removed by this point however
    all_lfns = [
        LFNdict[this_lfn].locations for this_lfn in LFNdict
        if this_lfn not in bad_lfns
    ]

    logger.info("Got replicas")

    for this_input in inputs:
        if this_input.lfn not in bad_lfns:
            file_replicas[this_input.lfn] = this_input.locations

    logger.info("found all replicas")

    logger.info("Calculating site<->SE Mapping")

    site_to_SE_mapping = {}
    SE_to_site_mapping = {}

    # Now lets generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    site_dict, allSubSets, allChosenSets = calculateSiteSEMapping(
        file_replicas, wanted_common_site, uniqueSE, site_to_SE_mapping,
        SE_to_site_mapping)

    logger.debug("Found all SE in use")

    # BELOW IS WHERE THE ACTUAL SPLITTING IS DONE

    logger.info("Calculating best data subsets")

    iterations = 0
    # Loop over all LFNs
    while len(site_dict.keys()) > 0:

        # LFN left to be used
        # NB: Can't modify this list and iterate over it directly in python
        LFN_instances = site_dict.keys()
        # Already used LFN
        chosen_lfns = set()

        for iterating_LFN in LFN_instances:

            # If this has previously been selected lets ignore it and move on
            if iterating_LFN in chosen_lfns:
                continue

            # Use this seed to try and construct a subset
            req_sitez = allChosenSets[iterating_LFN]
            _this_subset = []

            #logger.debug("find common LFN for: " + str(allChosenSets[iterating_LFN]))

            # Construct subset
            # Starting with i, populate subset with LFNs which have an
            # overlap of at least 2 SE
            for this_LFN in LFN_instances:
                if this_LFN in chosen_lfns:
                    continue
                if req_sitez.issubset(site_dict[this_LFN]):
                    if len(_this_subset) >= filesPerJob:
                        break
                    _this_subset.append(this_LFN)

            limit = int(math.floor(float(filesPerJob) * good_fraction))

            #logger.debug("Size limit: %s" % str(limit))

            # If subset is too small throw it away
            if len(_this_subset) < limit:
                #logger.debug("%s < %s" % (str(len(_this_subset)), str(limit)))
                allChosenSets[iterating_LFN] = generate_site_selection(
                    site_dict[iterating_LFN], wanted_common_site, uniqueSE,
                    site_to_SE_mapping, SE_to_site_mapping)
                continue
            else:
                logger.debug("found common LFN for: " +
                             str(allChosenSets[iterating_LFN]))
                logger.debug("%s > %s" % (str(len(_this_subset)), str(limit)))
                # else Dataset was large enough to be considered useful
                logger.debug("Generating Dataset of size: %s" %
                             str(len(_this_subset)))
                ## Construct DiracFile here as we want to keep the above combination
                allSubSets.append([
                    DiracFile(lfn=str(this_LFN)) for this_LFN in _this_subset
                ])

                for lfn in _this_subset:
                    site_dict.pop(lfn)
                    allChosenSets.pop(lfn)
                    chosen_lfns.add(lfn)

        # Lets keep track of how many times we've tried this
        iterations = iterations + 1

        # Can take a while so lets not let threads become un-locked
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

        # If on final run, will exit loop after this so lets try and cleanup
        if iterations >= iterative_limit:

            if good_fraction < 0.5:
                good_fraction = good_fraction * 0.75
                iterations = 0
            elif wanted_common_site > 1:
                logger.debug("Reducing Common Site Size")
                wanted_common_site = wanted_common_site - 1
                iterations = 0
                good_fraction = 0.75
            else:
                good_fraction = good_fraction * 0.75

            logger.debug("good_fraction: %s" % str(good_fraction))

    split_files = allSubSets

    avg = float()
    for this_set in allSubSets:
        avg += float(len(this_set))
    avg /= float(len(allSubSets))

    logger.info("Average Subset size is: %s" % (str(avg)))

    # FINISHED SPLITTING CHECK!!!

    check_count = 0
    for i in split_files:
        check_count = check_count + len(i)

    if check_count != len(inputs) - len(bad_lfns):
        logger.error("SERIOUS SPLITTING ERROR!!!!!")
        raise SplitterError("Files Missing after Splitting!")
    else:
        logger.info("File count checked! Ready to Submit")

    # RETURN THE RESULT

    logger.info("Created %s subsets" % str(len(split_files)))

    #logger.info( "Split Files: %s" % str(split_files) )

    for dataset in split_files:
        yield dataset
Example #15
    def _splitter(self, job, inputdata):
        if (inputdata is None) or (len(inputdata.files) == 0):
            logger.error('Cannot split if no inputdata given!')
            raise SplittingError('inputdata is None')
        logger.debug("Found %s files in inputdata" % str(len(inputdata.files)))
        return DatasetSplitter(inputdata, self.filesPerJob, self.maxFiles)
Example #16
def DiracSplitter(inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for dirac split jobs
    """
    #logger.debug( "DiracSplitter" )
    #logger.debug( "inputs: %s" % str( inputs ) )
    split_files = []
    i = inputs.__class__()

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError(
            "Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata"
        )

    all_files = igroup(inputs.files[:maxFiles],
                       getConfig('DIRAC')['splitFilesChunks'],
                       leftovers=True)

    #logger.debug( "Looping over all_files" )
    #logger.debug( "%s" % str( all_files ) )

    for files in all_files:

        i.files = files

        LFNsToSplit = i.getLFNs()

        if (len(LFNsToSplit)) > 1:

            result = execute('splitInputData(%s, %d)' %
                             (i.getLFNs(), filesPerJob))

            if not result_ok(result):
                logger.error('DIRAC:: Error splitting files: %s' % str(result))
                raise SplittingError('Error splitting files.')

            split_files += result.get('Value', [])

        else:

            split_files = [LFNsToSplit]

    if len(split_files) == 0:
        raise SplittingError('An unknown error occurred.')

    # FIXME
    # check that all files were available on the grid
    big_list = []
    for l in split_files:
        big_list.extend(l)
    diff = set(inputs.getFileNames()[:maxFiles]).difference(big_list)
    if len(diff) > 0:
        for f in diff:
            logger.warning('Ignored file: %s' % f)
        if not ignoremissing:
            raise SplittingError('Some files not found!')
    ###

    logger.debug("Split Files: %s" % str(split_files))

    for _dataset in split_files:
        dataset = []
        for _lfn in _dataset:
            dataset.append(DiracFile(lfn=_lfn))
        yield dataset
Example #17
def GangaDiracSplitter(inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for dirac split jobs
    """

    split_files = []
    i = inputs.__class__()

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError(
            "Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata")

    file_replicas = {}

    from Ganga.GPI import queues

    for i in inputs:
        #logging.debug( "getting metadata: %s" % str(i.lfn) )
        queues.add(i.getReplicas)

    logger.info("Requesting LFN replica info")

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    all_lfns = [i.locations for i in inputs]
    while [] in all_lfns:
        import time
        time.sleep(0.5)
        all_lfns = [i.locations for i in inputs]

    logger.info("Got replicas")

    for i in inputs:
        file_replicas[i.lfn] = i.locations
        #logger.info( "%s" % str( i.accessURL() ) )

    logger.debug("found all replicas")

    super_dict = dict()
    for lfn, repz in file_replicas.iteritems():
        sitez = set([])
        for i in repz:
            # print i
            sitez.add(i)
        super_dict[lfn] = sitez

    allSubSets = []
    allChosenSets = {}

    logger.info("Determining overlap")

    import random
    for i in super_dict.keys():

        # Randomly select 2 SEs as the starting point for splitting jobs
        if len(super_dict[i]) > 2:
            req_sitez = set([])
            chosen = random.sample(super_dict[i], 2)
            for s in chosen:
                req_sitez.add(s)
        # Keep the 2 or fewer SEs as the SEs of choice
        else:
            req_sitez = set([])
            for s in super_dict[i]:
                req_sitez.add(s)

        allChosenSets[i] = req_sitez

    logger.debug("Found all SE in use")

    Tier1Sites = set([])

    for i in super_dict.keys():

        req_sitez = allChosenSets[i]
        _this_subset = []

        # Starting with i, populate subset with LFNs which have an
        # overlap of at least 2 SE

        for k in super_dict.keys():
            if req_sitez.issubset(super_dict[k]):
                if len(_this_subset) >= filesPerJob:
                    break
                _this_subset.append(str(k))
                super_dict.pop(k)

        if len(_this_subset) > 0:
            allSubSets.append(_this_subset)

    split_files = allSubSets

    logger.info("Created %s subsets" % str(len(split_files)))

    #logger.info( "Split Files: %s" % str(split_files) )

    for dataset in split_files:
        yield dataset
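
The grouping above works by picking a small "seed" set of storage elements per LFN and then collecting LFNs whose replica sites contain that seed, up to filesPerJob per group. A minimal sketch of that idea with plain dicts and sets (no Ganga objects; the helper name group_by_common_sites is illustrative):

import random

def group_by_common_sites(replica_sites, files_per_job, seed_size=2):
    """replica_sites maps LFN -> set of sites holding a replica."""
    remaining = dict(replica_sites)
    groups = []
    for lfn in list(remaining):
        if lfn not in remaining:
            continue  # already placed in an earlier group
        sites = remaining[lfn]
        seed = set(random.sample(sorted(sites), min(seed_size, len(sites))))
        # Collect LFNs whose replica sites contain the whole seed.
        group = [l for l, s in remaining.items() if seed.issubset(s)][:files_per_job]
        for placed in group:
            remaining.pop(placed)
        groups.append(group)
    return groups

replicas = {'lfn1': {'SE-A', 'SE-B'},
            'lfn2': {'SE-A', 'SE-B', 'SE-C'},
            'lfn3': {'SE-C'}}
print(group_by_common_sites(replicas, files_per_job=2))
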