Example 1
 def test_LHCbDataset_isEmpty(self):
     ds = LHCbDataset()
     assert ds.isEmpty(), 'newly created dataset should be empty'
     if not getConfig('Output')['ForbidLegacyInput']:
         ds = make_dataset(['pfn:a'])
     else:
         ds = make_new_dataset([LocalFile('/file/path/someFile')])
     assert not ds.isEmpty(), 'dataset with one file should not be empty'
Example 2
def data_filter_site(data, sites=('CERN-RDST','CERN-USER'), verbose=False):
    """
    Filter a dataset, retaining only the LFNs that have replicas at the specified sites.

    Requires a working dirac environment:
    SetupLHCbDirac
    SetupGanga

    Arguments:
    data    - LHCbDataset to filter
    sites   - List of SE site names. If None, keep only LFNs that have at least one allowed replica.
    verbose - If True, print the raw output of dirac-dms-lfn-replicas.

    Return: filtered LHCbDataset

    Usage: data2 = LHCbDataset(syrena.data_filter_site(data))
    """
    from GangaLHCb.Lib.LHCbDataset import LHCbDataset
    from tempfile import NamedTemporaryFile
    import subprocess
    import re
    import os

    _tmp_file = NamedTemporaryFile(mode='w', delete=False)
    _tmp_name = _tmp_file.name
    _data_list = "\n".join([_f.name for _f in data])
    _tmp_file.write(_data_list)
    _tmp_file.close()

    # Let's use an external command as it is much faster
    _out = LHCbDataset()
    _input = ["dirac-dms-lfn-replicas","-",_tmp_name]
    _result = subprocess.check_output(_input).splitlines()
    if verbose:
        print(_result)
    _current = None
    _msg = None
    _relfn = re.compile(r"\s+(/lhcb.*?) :(.*)")
    _resite = re.compile(r"\s+([A-Z0-9\-]+) :.*")
    for _l in _result:
        _mlfn = _relfn.search(_l)
        if _mlfn:
            _current = _mlfn.group(1)
            _msg = _mlfn.group(2)
            if sites is None and _msg != " No allowed replica found":
                _out.extend(("lfn:%s" % _current,))
        elif sites is not None:
            _msite = _resite.search(_l)
            if _msite:
                _site = _msite.group(1)
                if _site in sites:
                    _out.extend(("lfn:%s" % _current,))
    os.unlink(_tmp_name)
    return _out
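
A minimal usage sketch of the helper above, assuming a Ganga session with LHCbDirac configured, an existing LHCbDataset called data, and that the function is provided by a module named syrena (as its docstring suggests):

# Hypothetical usage; data, syrena and the site names are assumptions.
import syrena

# keep only LFNs with a replica at one of the listed storage elements
data_at_cern = syrena.data_filter_site(data, sites=('CERN-RDST', 'CERN-USER'))

# sites=None keeps every LFN that has at least one allowed replica
data_with_replica = syrena.data_filter_site(data, sites=None, verbose=True)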
Example 3
    def get_input_data(self):
        '''Collects the user specified input data that the job will process'''
        data = []
        try:
            opts_input = self.opts_dict['EventSelector']['Input']
            data = [f for f in opts_input]
        except KeyError as err:
            logger.debug('No inputdata has been defined in the options file.')
            logger.debug("%s" % str(err))

        from Ganga.GPIDev.Base.Filters import allComponentFilters
        file_filter = allComponentFilters['gangafiles']

        all_files = []
        for d in data:
            p1 = d.find('DATAFILE=') + len('DATAFILE=')
            quote = d[p1]
            p2 = d.find(quote, p1 + 1)
            f = d[p1 + 1:p2]
            this_file = file_filter(f, None)
            if this_file is None:
                this_file = LocalFile(name=f)
            all_files.append(this_file)

        ds = LHCbDataset(files=all_files, fromRef=True)
        return ds
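
For reference, a small standalone sketch of the DATAFILE extraction performed above, run on a made-up Gaudi EventSelector input string; it assumes the value directly follows DATAFILE= and is wrapped in matching quotes:

# Standalone sketch of the quote parsing above, with a made-up options string.
d = "DATAFILE='LFN:/lhcb/LHCb/Collision16/SOME.DST' TYP='POOL_ROOTTREE' OPT='READ'"

p1 = d.find('DATAFILE=') + len('DATAFILE=')   # index of the opening quote
quote = d[p1]                                 # either ' or "
p2 = d.find(quote, p1 + 1)                    # position of the closing quote
f = d[p1 + 1:p2]                              # the bare file name / LFN

print(f)  # -> LFN:/lhcb/LHCb/Collision16/SOME.DST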
Example 4
 def test_LHCbDataset___len__(self):
     ds = LHCbDataset()
     assert len(ds) == 0
     if not getConfig('Output')['ForbidLegacyInput']:
         ds = make_dataset(['pfn:a'])
     else:
         ds = make_new_dataset([LocalFile('/file/path/someFile')])
     assert len(ds) == 1
Example 5
    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create an output unit given this output data"""

        # we need a parent job that has completed to get the output files
        incl_pat_list = []
        excl_pat_list = []
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return None

            for inds in self.inputdata:
                from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
                if isType(
                        inds, TaskChainInput
                ) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask

        # go over the output files and copy the appropriate ones over as
        # input files
        flist = []
        import re
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            if job.subjobs:
                job_list = job.subjobs
            else:
                job_list = [job]

            for sj in job_list:
                for f in sj.outputfiles:

                    # match any dirac files that are allowed in the file mask
                    if isType(f, DiracFile):
                        if len(incl_pat_list) > 0:
                            for pat in incl_pat_list:
                                if re.search(pat, f.lfn):
                                    flist.append("LFN:" + f.lfn)
                        else:
                            flist.append("LFN:" + f.lfn)

                        if len(excl_pat_list) > 0:
                            for pat in excl_pat_list:
                                if re.search(
                                        pat,
                                        f.lfn) and "LFN:" + f.lfn in flist:
                                    flist.remove("LFN:" + f.lfn)

        # just do one unit that uses all data
        unit = LHCbUnit()
        unit.name = "Unit %d" % len(self.units)
        unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

        return unit
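
A minimal standalone sketch of the include/exclude file-mask filtering used above, with made-up LFNs and patterns (matched with re.search, as in the method); it deliberately simplifies the handling of duplicates:

import re

# made-up LFNs and masks illustrating the include/exclude filtering above
lfns = ['/lhcb/user/x/ntuple_1.root', '/lhcb/user/x/hist_1.root']
incl_pat_list = [r'ntuple_.*\.root']
excl_pat_list = [r'hist_.*\.root']

flist = []
for lfn in lfns:
    # an empty include list means "take everything"
    if not incl_pat_list or any(re.search(p, lfn) for p in incl_pat_list):
        flist.append('LFN:' + lfn)
for pat in excl_pat_list:
    flist = [f for f in flist if not re.search(pat, f)]

print(flist)  # -> ['LFN:/lhcb/user/x/ntuple_1.root']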
Example 6
 def __deepcopy__(self, memo=None):
     l = LHCbAnalysisTransform()
     l.application = deepcopy(self.application, memo)
     l.backend = deepcopy(self.backend, memo)
     l.splitter = deepcopy(self.splitter, memo)
     l.merger = deepcopy(self.merger, memo)
     l.query = deepcopy(self.query, memo)
     l.run_limit = deepcopy(self.run_limit, memo)
     l.inputsandbox = deepcopy(self.inputsandbox)
     l.outputsandbox = deepcopy(self.outputsandbox)
     if self.inputdata:
         l.inputdata = LHCbDataset()
         l.inputdata.files = self.inputdata.files[:]
     if self.outputdata:
         l.outputdata = OutputData()
         l.outputdata.files = self.outputdata.files[:]
     l.name = self.name
     return l
Example 7
    def getDataset(self):
        '''Gets the dataset from the bookkeeping for current dict.'''
        if not self.dict:
            return None
        cmd = 'bkQueryDict(%s)' % self.dict
        result = get_result(cmd, 'BK query error.', 'BK query error.')
        files = []
        value = result['Value']
        if 'LFNs' in value:
            files = value['LFNs']
        if not type(files) is list:
            if 'LFNs' in files:  # i.e. a dict of LFN:Metadata
                files = files['LFNs'].keys()

        from GangaDirac.Lib.Files.DiracFile import DiracFile
        this_list = [DiracFile(lfn=f) for f in files]

        from GangaLHCb.Lib.LHCbDataset import LHCbDataset
        ds = LHCbDataset(files=this_list, fromRef=True)

        return addProxy(ds)
Example 8
    def getDataset(self):
        '''Gets the dataset from the bookkeeping for current path, etc.'''
        if not self.path:
            return None
        if self.type not in ['Path', 'RunsByDate', 'Run', 'Production']:
            raise GangaException('Type="%s" is not valid.' % self.type)
        if self.type != 'RunsByDate':
            if self.startDate:
                msg = 'startDate not supported for type="%s".' % self.type
                raise GangaException(msg)
            if self.endDate:
                msg = 'endDate not supported for type="%s".' % self.type
                raise GangaException(msg)
            if self.selection:
                msg = 'selection not supported for type="%s".' % self.type
                raise GangaException(msg)
        cmd = "getDataset('%s','%s','%s','%s','%s','%s')" % (
            self.path, self.dqflag, self.type, self.startDate, self.endDate,
            self.selection)
        from Ganga.GPIDev.Lib.GangaList.GangaList import GangaList
        knownLists = [tuple, list, GangaList]
        if isType(self.dqflag, knownLists):
            cmd = "getDataset('%s',%s,'%s','%s','%s','%s')" % (
                self.path, self.dqflag, self.type, self.startDate,
                self.endDate, self.selection)
        result = get_result(
            cmd,
            'BK query error.',
            credential_requirements=self.credential_requirements)

        logger.debug("Finished Running Command")

        files = []
        value = result
        if 'LFNs' in value:
            files = value['LFNs']
        if not type(files) is list:  # i.e. a dict of LFN:Metadata
            # if 'LFNs' in files: # i.e. a dict of LFN:Metadata
            files = files.keys()

        logger.debug("Creating DiracFile objects")

        ## Doesn't work, not clear why
        from GangaDirac.Lib.Files.DiracFile import DiracFile
        #new_files = []
        #def _createDiracLFN(this_file):
        #    return DiracFile(lfn = this_file)
        #GangaObject.__createNewList(new_files, files, _createDiracLFN)

        logger.debug("Creating new list")
        new_files = [DiracFile(lfn=f) for f in files]

        #new_files = [DiracFile(lfn=_file) for _file in files]
        #for f in files:
        #    new_files.append(DiracFile(lfn=f))
        #ds.extend([DiracFile(lfn = f)])

        logger.info("Constructing LHCbDataset")

        from GangaLHCb.Lib.LHCbDataset import LHCbDataset
        logger.debug("Imported LHCbDataset")
        ds = LHCbDataset(files=new_files, fromRef=True)

        logger.debug("Returning Dataset")

        return addProxy(ds)
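
A hedged usage sketch of the method above from the Ganga prompt, using a made-up bookkeeping path; it assumes this is BKQuery.getDataset() from GangaLHCb, whose path, type and dqflag attributes appear in the code:

# Hypothetical usage from a Ganga session (the bookkeeping path is made up).
bk = BKQuery(path='/LHCb/Collision16/Beam6500GeV-VeloClosed-MagDown/Real Data/90000000/EW.DST',
             type='Path', dqflag='OK')
ds = bk.getDataset()        # proxied LHCbDataset of DiracFile objects
print(len(ds.files))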
Example 9
    def updateQuery(self, resubmit=False):
        """Update the dataset information of the transforms. This will
        include any new data in the processing or re-run jobs that have data which
        has been removed."""
        if len(self.queries) == 0:
            raise GangaException(
                None,
                'Cannot call updateQuery() on an LHCbTransform without any queries'
            )

        if self._getParent() is not None:
            logger.info(
                'Retrieving latest bookkeeping information for transform %i:%i, please wait...'
                % (self._getParent().id, self.getID()))
        else:
            logger.info(
                'Retrieving latest bookkeeping information for transform, please wait...'
            )

        # check we have an input DS per BK Query
        while len(self.queries) > len(self.inputdata):
            self.inputdata.append(LHCbDataset())

        # loop over the queries and add fill file lists
        for id, query in enumerate(self.queries):

            # Get the latest dataset
            latest_dataset = query.getDataset()

            # Compare to previous inputdata, get new and removed
            logger.info(
                'Checking for new and removed data for query %d, please wait...'
                % self.queries.index(query))
            dead_data = LHCbDataset()
            new_data = LHCbDataset()

            # loop over the old data and compare
            new_data.files += latest_dataset.difference(
                self.inputdata[id]).files
            dead_data.files += self.inputdata[id].difference(
                latest_dataset).files

            # for dead data, find then kill/remove any associated jobs
            # loop over units and check any associated with this DS
            # TODO: Follow through chained tasks
            for unit in self.units:
                # unit associated with this dataset
                if unit.input_datset_index != id:
                    continue

                # find the job
                if len(unit.active_job_ids) == 0:
                    continue

                # check the data
                for f in dead_data.files:
                    if f in unit.inputdata.files:

                        # kill the job
                        job = getJobByID(unit.active_job_ids[0])
                        if job.status in ['submitted', 'running']:
                            job.kill()

                        # forget the job
                        unit.prev_job_ids.append(unit.active_job_ids[0])
                        unit.active_job_ids = []
                        break

            # in any case, now just set the DS files to the new set
            self.inputdata[id].files = []
            self.inputdata[id].files = latest_dataset.files
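
A minimal sketch of the new/removed comparison used above, with made-up LFNs; it assumes, as the calls here imply, that LHCbDataset.difference() returns the files present in the first dataset but not in the second:

# Made-up datasets illustrating the bookkeeping comparison above.
old = LHCbDataset(files=[DiracFile(lfn='/lhcb/data/a.dst'),
                         DiracFile(lfn='/lhcb/data/b.dst')])
new = LHCbDataset(files=[DiracFile(lfn='/lhcb/data/b.dst'),
                         DiracFile(lfn='/lhcb/data/c.dst')])

new_data = new.difference(old)    # files that appeared since the last update (c.dst)
dead_data = old.difference(new)   # files that were removed from the query (a.dst)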
Example 10
    def createUnits(self):
        """Create new units if required given the inputdata"""

        # call parent for chaining
        super(LHCbTransform, self).createUnits()

        if len(self.inputdata) > 0:

            # check for conflicting input
            if self.mc_num_units > 0:
                logger.warning("Inputdata specified - MC Event info ignored")

            # loop over input data and see if we need to create any more units
            import copy
            for id, inds in enumerate(self.inputdata):

                if not isType(inds, LHCbDataset):
                    continue

                # go over the units and see what files have been assigned
                assigned_data = LHCbDataset()
                for unit in self.units:

                    if unit.input_datset_index != id:
                        continue

                    assigned_data.files += unit.inputdata.files

                # any new files
                new_data = LHCbDataset(
                    files=self.inputdata[id].difference(assigned_data).files)

                if len(new_data.files) == 0:
                    continue

                # Create units for these files
                step = self.files_per_unit
                if step <= 0:
                    step = len(new_data.files)

                for num in range(0, len(new_data.files), step):
                    unit = LHCbUnit()
                    unit.name = "Unit %d" % len(self.units)
                    unit.input_datset_index = id
                    self.addUnitToTRF(unit)
                    unit.inputdata = copy.deepcopy(self.inputdata[id])
                    unit.inputdata.files = []
                    unit.inputdata.files += new_data.files[num:num + step]

        elif self.mc_num_units > 0:
            if len(self.units) == 0:
                # check for appropriate splitter
                from GangaLHCb.Lib.Splitters.GaussSplitter import GaussSplitter
                if not self.splitter or not isType(self.splitter, GaussSplitter):
                    logger.warning(
                        "No GaussSplitter specified - first event info ignored"
                    )

                # create units for MC generation
                for i in range(0, self.mc_num_units):
                    unit = LHCbUnit()
                    unit.name = "Unit %d" % len(self.units)
                    self.addUnitToTRF(unit)
        else:
            import traceback
            traceback.print_stack()
            logger.error(
                "Please specify either inputdata or MC info for unit generation"
            )
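
A small standalone sketch of the files_per_unit chunking used above, with made-up file names:

# Split the new files into units of at most files_per_unit entries each.
new_files = ['f1', 'f2', 'f3', 'f4', 'f5']
files_per_unit = 2

step = files_per_unit if files_per_unit > 0 else len(new_files)
chunks = [new_files[num:num + step] for num in range(0, len(new_files), step)]

print(chunks)  # -> [['f1', 'f2'], ['f3', 'f4'], ['f5']]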
Example 11
    def update(self, resubmit=False):
        """Update the dataset information of the transforms. This will
        include any new data in the processing or re-run jobs that have data which
        has been removed."""
        if self.query is None:
            raise GangaException(
                None,
                'Cannot call update() on an LHCbTransform without the query attribute set'
            )
        if len(self.toProcess_dataset.files):
            raise GangaException(
                None,
                'Cannot call update() on an LHCbTransform that has already been updated. There is outstanding data to process, try typing transform.run()'
            )

        # Get the latest dataset
        logger.info(
            'Retrieving latest bookkeeping information for transform %i:%i, please wait...'
            % (self.task_id, self.transform_id))
        latest_dataset = self.query.getDataset()
        self.toProcess_dataset.files = latest_dataset.files

        # Compare to previous inputdata, get new and removed
        logger.info('Checking for new and removed data, please wait...')
        dead_data = LHCbDataset()
        if self.inputdata is not None:
            # Get new files
            self.toProcess_dataset.files = latest_dataset.difference(
                self.inputdata).files
            # Get removed files
            dead_data.files += self.inputdata.difference(latest_dataset).files
            # If nothing to be updated then exit

        # Carry out actions as needed
        redo_jobs = self._getJobsWithRemovedData(dead_data)
        new_jobs = len(self.toProcess_dataset.files)
        if not new_jobs and not redo_jobs:
            logger.info('Transform %i:%i is already up to date' %
                        (self.task_id, self.transform_id))
            return

        if redo_jobs and not resubmit:
            logger.info(
                'There are jobs with out-of-date datasets; some datafiles must '
                'be removed. Updating will mean loss of the existing output, and merged data '
                'will change accordingly. Due to the permanent nature of this request, please re-run '
                'update with the True argument, i.e. update(True)')
            self.toProcess_dataset.files = []
            return

        if redo_jobs:
            self.removed_data.files += dead_data.files
            for j in redo_jobs:
                if j.status in [
                        'submitting', 'submitted', 'running', 'completing'
                ]:
                    logger.warning(
                        'Job \'%s\' is still running but is marked for resubmission due to removed data; it will be killed first'
                        % j.fqid)
                    j.kill()
                # for j in self._getJobsWithRemovedData(self.removed_data):
                logger.info(
                    'Resubmitting job \'%s\' as its dataset is out of date.'
                    % j.fqid)
                j.resubmit()

        if new_jobs:
            logger.info(
                'Transform %i:%i updated, adding partition %i containing %i more file(s) for processing'
                %
                (self.task_id, self.transform_id, len(self._partition_status),
                 len(self.toProcess_dataset.files)))
            self.setPartitionStatus(len(self._partition_status), 'ready')
            if self.status != 'new':
                # After the first time, when transform is running or complete,
                # calling update will submit the jobs thereby blocking the user
                # thread
                self._submitJobs(1)
        self.inputdata = LHCbDataset(latest_dataset.files)
Example 12
 def __init__(self):
     super(LHCbAnalysisTransform, self).__init__()
     self.toProcess_dataset = LHCbDataset()
     self.removed_data = LHCbDataset()
Example 13
def make_new_dataset(files):
    ds = LHCbDataset(files)
    return ds