Example #1
# Imports as used elsewhere in these examples; exact module paths can vary
# between Ganga versions.
from GangaCore.Core.exceptions import SplitterError
from GangaDirac.Lib.Files.DiracFile import DiracFile
from GangaDirac.Lib.Splitters.SplitterUtils import DiracSplitter


def dirac_parametric_split(app):
    """ Bulk job submission splitter. TODO document more
    Args:
        app (IApplication): Application belonging to the job in question
    """
    data = app.getJobObject().inputdata
    splitter = app.getJobObject().splitter

    # DiracSplitter yields one dataset (list of input files) per subjob
    split_data = list(DiracSplitter(data, splitter.filesPerJob,
                                    splitter.maxFiles, splitter.ignoremissing))

    split_files = []

    for dataset in split_data:
        this_dataset = []
        for this_file in dataset:
            if isinstance(this_file, DiracFile):
                this_dataset.append(this_file.lfn)
            else:
                raise SplitterError("ERROR: file: %s NOT of type DiracFile" %
                                    str(this_file))
        split_files.append(this_dataset)

    if split_files:
        return split_files

    return None
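For orientation, a minimal sketch of the data shape this splitter produces; FakeDiracFile and the LFNs below are invented stand-ins, not the Ganga API:

# Illustrative sketch only: stand-in objects, not Ganga classes.
# dirac_parametric_split returns a list of LFN lists, one inner
# list per parametric subjob.
class FakeDiracFile(object):
    def __init__(self, lfn):
        self.lfn = lfn

datasets = [[FakeDiracFile('/lhcb/data/a.dst'), FakeDiracFile('/lhcb/data/b.dst')],
            [FakeDiracFile('/lhcb/data/c.dst')]]

split_files = [[f.lfn for f in dataset] for dataset in datasets]
assert split_files == [['/lhcb/data/a.dst', '/lhcb/data/b.dst'],
                       ['/lhcb/data/c.dst']]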
Example #2
def dirac_parametric_split(app):
    data = app.getJobObject().inputdata
    splitter = app.getJobObject().splitter

    # DiracSplitter yields one dataset (list of input files) per subjob
    split_data = list(DiracSplitter(data, splitter.filesPerJob,
                                    splitter.maxFiles, splitter.ignoremissing))

    split_files = []

    for dataset in split_data:
        this_dataset = []
        for this_file in dataset:
            # isType is Ganga's proxy-aware isinstance check
            if isType(this_file, DiracFile):
                this_dataset.append(this_file.lfn)
            else:
                raise SplitterError("ERROR: file: %s NOT of type DiracFile" %
                                    str(this_file))
        split_files.append(this_dataset)

    if split_files:
        return split_files

    return None
Example #3
# logger, result_ok, SplittingError and DiracSplitter come from the
# surrounding Ganga module
from collections import defaultdict


def DiracRunSplitter(inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields one dataset per subjob for Dirac split jobs,
    grouping the input files by run number.
    """

    metadata = inputs.bkMetadata()
    if not result_ok(metadata):
        logger.error('Error getting input metadata: %s' % str(metadata))
        raise SplittingError('Error splitting files.')
    if metadata['Value']['Failed']:
        logger.error('Error getting part of metadata')
        raise SplittingError('Error splitting files.')

    runs = defaultdict(list)
    for lfn, v in metadata['Value']['Successful'].items():
        # Find the input file object matching this LFN
        f = next(f for f in inputs.files if f.lfn == lfn)
        runs[v['RunNumber']].append(f)
    logger.info('Found %d runs in inputdata' % len(runs))

    for run, files in sorted(runs.items()):
        run_inputs = inputs.__class__()
        run_inputs.files = files
        if len(files) > filesPerJob:
            # Runs larger than filesPerJob are split further; no maxFiles
            # cap is applied within a run
            datasets = list(
                DiracSplitter(run_inputs, filesPerJob, None, ignoremissing))
        else:
            datasets = [files]
        logger.info('Run %d with %d files was split in %d subjobs' %
                    (run, len(files), len(datasets)))
        yield from datasets
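The heart of DiracRunSplitter is the run-bucketing step. A self-contained sketch of that idea; the (lfn, run number) pairs are invented sample data standing in for the bookkeeping metadata:

from collections import defaultdict

# Invented sample metadata: (lfn, run number) pairs.
metadata = [('/lhcb/a.dst', 1001), ('/lhcb/b.dst', 1001), ('/lhcb/c.dst', 1002)]

runs = defaultdict(list)  # run number -> list of LFNs
for lfn, run in metadata:
    runs[run].append(lfn)

# Each run becomes at least one subjob, processed in run order
for run, lfns in sorted(runs.items()):
    print('run %d: %d file(s)' % (run, len(lfns)))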
Example #4
    def _splitter(self, job, inputdata):

        logger.debug("_splitter")

        indata = inputdata

        # Take the dataset options from the inputdata if given, otherwise
        # fall back to defaults
        if indata is not None:
            self.depth = indata.depth
            self.persistency = indata.persistency
            self.XMLCatalogueSlice = indata.XMLCatalogueSlice
        else:
            self.depth = 0
            self.persistency = None
            self.XMLCatalogueSlice = None
        # Dirac backends live in modules whose path contains 'Dirac'
        if stripProxy(job.backend).__module__.find('Dirac') > 0:

            logger.debug("found Dirac backend")

            if self.filesPerJob > 100:
                self.filesPerJob = 100  # enforce the cap of 100 files per subjob (see warning above)
            logger.debug("indata: %s " % str(indata))

            if self.splitterBackend == "GangaDiracSplitter":
                from GangaDirac.Lib.Splitters.GangaSplitterUtils import GangaDiracSplitter
                outdata = GangaDiracSplitter(indata, self.filesPerJob,
                                             self.maxFiles, self.ignoremissing)
            elif self.splitterBackend == "OfflineGangaDiracSplitter":
                from GangaDirac.Lib.Splitters.OfflineGangaDiracSplitter import OfflineGangaDiracSplitter
                outdata = OfflineGangaDiracSplitter(indata, self.filesPerJob,
                                                    self.maxFiles,
                                                    self.ignoremissing)
            elif self.splitterBackend == "splitInputDataBySize":
                from GangaLHCb.Lib.Splitters.LHCbSplitterUtils import DiracSizeSplitter
                outdata = DiracSizeSplitter(indata, self.filesPerJob,
                                            self.maxFiles, self.ignoremissing)
            elif self.splitterBackend == "splitInputData":
                indata = stripProxy(copy.deepcopy(inputdata))
                from GangaDirac.Lib.Splitters.SplitterUtils import DiracSplitter
                outdata = DiracSplitter(indata, self.filesPerJob,
                                        self.maxFiles, self.ignoremissing)
            else:
                raise SplitterError("Unknown splitterBackend: %s" % self.splitterBackend)

            logger.debug("outdata: %s " % str(outdata))
            return outdata
        else:
            logger.debug("Calling Parent Splitter as not on Dirac")
            return super(SplitByFiles, self)._splitter(job, indata)
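The if/elif chain above maps a splitterBackend string to a splitter callable. As a design note, the same dispatch can be written as a lookup table; this is a sketch, not the Ganga implementation, and it assumes the splitters share one signature (Example #6 shows OfflineGangaDiracSplitter can also take a bannedSites argument):

import importlib

# Sketch: table-driven equivalent of the splitterBackend dispatch.
# Module paths are the ones imported in the examples above.
_SPLITTERS = {
    'GangaDiracSplitter': ('GangaDirac.Lib.Splitters.GangaSplitterUtils',
                           'GangaDiracSplitter'),
    'OfflineGangaDiracSplitter': ('GangaDirac.Lib.Splitters.OfflineGangaDiracSplitter',
                                  'OfflineGangaDiracSplitter'),
    'splitInputDataBySize': ('GangaLHCb.Lib.Splitters.LHCbSplitterUtils',
                             'DiracSizeSplitter'),
    # Note: the original 'splitInputData' branch also deep-copies the inputdata
    'splitInputData': ('GangaDirac.Lib.Splitters.SplitterUtils',
                       'DiracSplitter'),
}

def select_splitter(name):
    """Return the splitter callable registered under name."""
    try:
        module, attr = _SPLITTERS[name]
    except KeyError:
        # The real code raises SplitterError here
        raise ValueError('Unknown splitterBackend: %s' % name)
    return getattr(importlib.import_module(module), attr)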
Example #5
    def _splitter(self, job, inputdata):

        logger.debug("_splitter")

        # Work on a copy so the master job's inputdata is left untouched
        indata = stripProxy(copy.deepcopy(job.inputdata))

        if not job.inputdata:
            share_path = os.path.join(
                expandfilename(getConfig('Configuration')['gangadir']),
                'shared',
                getConfig('Configuration')['user'],
                job.application.is_prepared.name, 'inputdata',
                'options_data.pkl')
            if os.path.exists(share_path):
                # Fall back to the inputdata pickled when the application
                # was prepared
                with open(share_path, 'rb') as f:
                    indata = pickle.load(f)
            else:
                logger.error('Cannot split if no inputdata given!')
                raise SplittingError(
                    'job.inputdata is None and no inputdata found in optsfile')

        self.depth = indata.depth
        self.persistency = indata.persistency
        self.XMLCatalogueSlice = indata.XMLCatalogueSlice

        if stripProxy(job.backend).__module__.find('Dirac') > 0:
            if self.filesPerJob > 100:
                self.filesPerJob = 100  # see above warning
            logger.debug("indata: %s " % str(indata))
            outdata = DiracSplitter(indata, self.filesPerJob, self.maxFiles,
                                    self.ignoremissing)
            logger.debug("outdata: %s " % str(outdata))
            return outdata
        else:
            logger.error(
                "This splitter has not yet been implemented for all IGangaFile objects"
            )
            raise NotImplementedError
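The inputdata handling above falls back to the dataset pickled when the application was prepared. A self-contained sketch of that pattern; load_inputdata is a hypothetical helper and share_path an invented argument, not Ganga API:

import os
import pickle

def load_inputdata(inputdata, share_path):
    """Return inputdata if set, else the copy pickled at prepare time.

    Sketch of the fallback in _splitter above; share_path is assumed to
    point at a pickle such as .../inputdata/options_data.pkl.
    """
    if inputdata:
        return inputdata
    if not os.path.exists(share_path):
        raise RuntimeError('job.inputdata is None and no inputdata found in optsfile')
    with open(share_path, 'rb') as f:
        return pickle.load(f)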
Example #6
    def _splitter(self, job, inputdata):
        """
        This is the main method used in splitting by inputdata for Dirac backends

        Args:
            job (Job): This is the master-job object which contains everything we need to split by inputdata
            inputdata (list): List of (DiracFile) objects which we're to use in the splitting

        Returns:
            outdata (generator): a splitter Generator

        """

        logger.debug("_splitter")

        indata = inputdata

        # Take the dataset options from the inputdata if given, otherwise
        # fall back to defaults
        if indata is not None:
            self.depth = indata.depth
            self.persistency = indata.persistency
            self.XMLCatalogueSlice = indata.XMLCatalogueSlice
        else:
            self.depth = 0
            self.persistency = None
            self.XMLCatalogueSlice = None

        # Dirac backends live in modules whose path contains 'Dirac'
        if stripProxy(job.backend).__module__.find('Dirac') > 0:
            bannedSites = []
            if 'BannedSites' in job.backend.settings:
                bannedSites = job.backend.settings['BannedSites']
            logger.debug("found Dirac backend")

            if self.filesPerJob > 100:
                self.filesPerJob = 100  # enforce the cap of 100 files per subjob (see warning above)
            logger.debug("indata: %s " % str(indata))

            if self.splitterBackend == "GangaDiracSplitter":
                from GangaDirac.Lib.Splitters.GangaSplitterUtils import GangaDiracSplitter
                outdata = GangaDiracSplitter(indata,
                                             self.filesPerJob,
                                             self.maxFiles,
                                             self.ignoremissing)
            elif self.splitterBackend == "OfflineGangaDiracSplitter":
                from GangaDirac.Lib.Splitters.OfflineGangaDiracSplitter import OfflineGangaDiracSplitter
                outdata = OfflineGangaDiracSplitter(indata,
                                                    self.filesPerJob,
                                                    self.maxFiles,
                                                    self.ignoremissing,
                                                    bannedSites)
            elif self.splitterBackend == "splitInputDataBySize":
                from GangaLHCb.Lib.Splitters.LHCbSplitterUtils import DiracSizeSplitter
                outdata = DiracSizeSplitter(indata,
                                            self.filesPerJob,
                                            self.maxFiles,
                                            self.ignoremissing)
            elif self.splitterBackend == "splitInputData":
                indata = stripProxy(copy.deepcopy(inputdata))
                from GangaDirac.Lib.Splitters.SplitterUtils import DiracSplitter
                outdata = DiracSplitter(indata,
                                        self.filesPerJob,
                                        self.maxFiles,
                                        self.ignoremissing)
            else:
                raise SplitterError("Unknown splitterBackend: %s" % self.splitterBackend)

            logger.debug("outdata: %s " % str(outdata))
            return outdata
        # If the jobs are not running on Dirac but use DiracFiles, we still
        # want some of the same checks
        elif indata is not None and all(isinstance(this_file, DiracFile) for this_file in indata):
            from GangaDirac.Lib.Splitters.OfflineGangaDiracSplitter import OfflineGangaDiracSplitter
            outdata = OfflineGangaDiracSplitter(indata,
                                                self.filesPerJob,
                                                self.maxFiles,
                                                self.ignoremissing)
            return outdata
        else:
            logger.debug("Calling Parent Splitter as not on Dirac")
            return super(SplitByFiles, self)._splitter(job, indata)
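Example #6 routes jobs three ways: Dirac backends go through the splitterBackend dispatch, non-Dirac jobs whose inputs are all DiracFiles still get the offline DiracFile checks, and everything else falls back to the parent splitter. A stand-in sketch of that decision; the argument names are assumptions, not Ganga API:

# Stand-in sketch of the three-way routing in Example #6.
def choose_route(is_dirac_backend, input_files, dirac_file_type):
    if is_dirac_backend:
        return 'splitterBackend dispatch (Dirac backend)'
    if input_files and all(isinstance(f, dirac_file_type) for f in input_files):
        return 'OfflineGangaDiracSplitter (DiracFile checks without Dirac)'
    return 'parent SplitByFiles._splitter'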