def test_LHCbDataset_isEmpty(self):
    ds = LHCbDataset()
    assert ds.isEmpty(), 'new dataset should be empty'
    if not getConfig('Output')['ForbidLegacyInput']:
        ds = make_dataset(['pfn:a'])
    else:
        ds = make_new_dataset([LocalFile('/file/path/someFile')])
    assert not ds.isEmpty(), 'dataset with one file should not be empty'
def data_filter_site(data, sites=('CERN-RDST', 'CERN-USER'), verbose=False):
    """Filter a dataset, retaining only LFNs with replicas at the specified sites.

    Requires a working DIRAC environment:
        SetupLHCbDirac
        SetupGanga

    Arguments:
        data  - LHCbDataset to filter
        sites - list/tuple of SE site names. If None, only LFNs without any
                allowed replica are filtered out.

    Return:
        filtered LHCbDataset

    Usage:
        data2 = LHCbDataset(syrena.data_filter_site(data))
    """
    from GangaLHCb.Lib.LHCbDataset import LHCbDataset
    from tempfile import NamedTemporaryFile
    import subprocess
    import re
    import os

    # Write the LFN list to a temporary file so it can be fed to the DIRAC CLI.
    _tmp_file = NamedTemporaryFile(mode='w', delete=False)
    _tmp_name = _tmp_file.name
    _data_list = "\n".join([_f.name for _f in data])
    _tmp_file.write(_data_list)
    _tmp_file.close()

    # Use the external command as it is much faster than per-file queries.
    _out = LHCbDataset()
    _input = ["dirac-dms-lfn-replicas", "-", _tmp_name]
    _result = subprocess.check_output(_input, universal_newlines=True).splitlines()
    if verbose:
        print(_result)

    _current = None
    _msg = None
    _relfn = re.compile(r"\s+(/lhcb.*?) :(.*)")
    _resite = re.compile(r"\s+([A-Z0-9\-]+) :.*")
    for _l in _result:
        _mlfn = _relfn.search(_l)
        if _mlfn:
            # An LFN line: remember the current LFN and any message after the colon.
            _current = _mlfn.group(1)
            _msg = _mlfn.group(2)
            if sites is None and _msg != " No allowed replica found":
                _out.extend(("lfn:%s" % _current,))
        elif sites is not None:
            # A replica line: keep the current LFN if the SE is in the allowed list.
            _msite = _resite.search(_l)
            if _msite:
                _site = _msite.group(1)
                if _site in sites:
                    _out.extend(("lfn:%s" % _current,))
    os.unlink(_tmp_name)
    return _out
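# The sketch below is illustrative only and not part of the original module: it shows how
# data_filter_site() above might be called, assuming `data` is an LHCbDataset and a working
# DIRAC environment is set up as described in the docstring. The helper name is hypothetical.
def _example_filter_by_site(data):
    """Keep only files replicated at CERN disk SEs; also show the sites=None mode."""
    cern_only = data_filter_site(data, sites=('CERN-RDST', 'CERN-USER'))
    with_any_replica = data_filter_site(data, sites=None)  # drop LFNs with no allowed replica
    return cern_only, with_any_replica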
def get_input_data(self):
    '''Collects the user-specified input data that the job will process'''
    data = []
    try:
        opts_input = self.opts_dict['EventSelector']['Input']
        data = [f for f in opts_input]
    except KeyError as err:
        logger.debug('No inputdata has been defined in the options file.')
        logger.debug("%s" % str(err))

    from Ganga.GPIDev.Base.Filters import allComponentFilters
    file_filter = allComponentFilters['gangafiles']

    all_files = []
    for d in data:
        # Each entry looks like "DATAFILE='<path>' ...": pull out the quoted path.
        p1 = d.find('DATAFILE=') + len('DATAFILE=')
        quote = d[p1]
        p2 = d.find(quote, p1 + 1)
        f = d[p1 + 1:p2]
        this_file = file_filter(f, None)
        if this_file is None:
            this_file = LocalFile(name=f)
        all_files.append(this_file)

    ds = LHCbDataset(files=all_files, fromRef=True)
    return ds
def test_LHCbDataset___len__(self):
    ds = LHCbDataset()
    assert len(ds) == 0
    if not getConfig('Output')['ForbidLegacyInput']:
        ds = make_dataset(['pfn:a'])
    else:
        ds = make_new_dataset([LocalFile('/file/path/someFile')])
    assert len(ds) == 1
def createChainUnit(self, parent_units, use_copy_output=True):
    """Create an output unit given this output data"""

    # we need a parent job that has completed to get the output files
    incl_pat_list = []
    excl_pat_list = []
    for parent in parent_units:
        if len(parent.active_job_ids) == 0 or parent.status != "completed":
            return None

    for inds in self.inputdata:
        from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
        if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID():
            incl_pat_list += inds.include_file_mask
            excl_pat_list += inds.exclude_file_mask

    # go over the output files and copy the appropriate ones over as input files
    flist = []
    import re
    for parent in parent_units:
        job = getJobByID(parent.active_job_ids[0])
        if job.subjobs:
            job_list = job.subjobs
        else:
            job_list = [job]

        for sj in job_list:
            for f in sj.outputfiles:
                # match any dirac files that are allowed in the file mask
                if isType(f, DiracFile):
                    if len(incl_pat_list) > 0:
                        for pat in incl_pat_list:
                            if re.search(pat, f.lfn):
                                flist.append("LFN:" + f.lfn)
                    else:
                        flist.append("LFN:" + f.lfn)

                    if len(excl_pat_list) > 0:
                        for pat in excl_pat_list:
                            if re.search(pat, f.lfn) and "LFN:" + f.lfn in flist:
                                flist.remove("LFN:" + f.lfn)

    # just do one unit that uses all data
    unit = LHCbUnit()
    unit.name = "Unit %d" % len(self.units)
    unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

    return unit
def __deepcopy__(self, memo=None):
    l = LHCbAnalysisTransform()
    l.application = deepcopy(self.application, memo)
    l.backend = deepcopy(self.backend, memo)
    l.splitter = deepcopy(self.splitter, memo)
    l.merger = deepcopy(self.merger, memo)
    l.query = deepcopy(self.query, memo)
    l.run_limit = deepcopy(self.run_limit, memo)
    l.inputsandbox = deepcopy(self.inputsandbox)
    l.outputsandbox = deepcopy(self.outputsandbox)
    if self.inputdata:
        l.inputdata = LHCbDataset()
        l.inputdata.files = self.inputdata.files[:]
    if self.outputdata:
        l.outputdata = OutputData()
        l.outputdata.files = self.outputdata.files[:]
    l.name = self.name
    return l
def getDataset(self):
    '''Gets the dataset from the bookkeeping for current dict.'''
    if not self.dict:
        return None
    cmd = 'bkQueryDict(%s)' % self.dict
    result = get_result(cmd, 'BK query error.', 'BK query error.')

    files = []
    value = result['Value']
    if 'LFNs' in value:
        files = value['LFNs']
    if not type(files) is list:
        if 'LFNs' in files:  # i.e. a dict of LFN:Metadata
            files = files['LFNs'].keys()

    from GangaDirac.Lib.Files.DiracFile import DiracFile
    this_list = [DiracFile(lfn=f) for f in files]

    from GangaLHCb.Lib.LHCbDataset import LHCbDataset
    ds = LHCbDataset(files=this_list, fromRef=True)
    return addProxy(ds)
def getDataset(self):
    '''Gets the dataset from the bookkeeping for current path, etc.'''
    if not self.path:
        return None
    if self.type not in ['Path', 'RunsByDate', 'Run', 'Production']:
        raise GangaException('Type="%s" is not valid.' % self.type)
    if self.type != 'RunsByDate':
        if self.startDate:
            msg = 'startDate not supported for type="%s".' % self.type
            raise GangaException(msg)
        if self.endDate:
            msg = 'endDate not supported for type="%s".' % self.type
            raise GangaException(msg)
        if self.selection:
            msg = 'selection not supported for type="%s".' % self.type
            raise GangaException(msg)

    cmd = "getDataset('%s','%s','%s','%s','%s','%s')" % (self.path, self.dqflag, self.type,
                                                         self.startDate, self.endDate, self.selection)
    from Ganga.GPIDev.Lib.GangaList.GangaList import GangaList
    knownLists = [tuple, list, GangaList]
    if isType(self.dqflag, knownLists):
        cmd = "getDataset('%s',%s,'%s','%s','%s','%s')" % (self.path, self.dqflag, self.type,
                                                           self.startDate, self.endDate, self.selection)

    result = get_result(cmd, 'BK query error.',
                        credential_requirements=self.credential_requirements)
    logger.debug("Finished Running Command")

    files = []
    value = result
    if 'LFNs' in value:
        files = value['LFNs']
    if not type(files) is list:  # i.e. a dict of LFN:Metadata
        files = files.keys()

    logger.debug("Creating DiracFile objects")
    # A bulk-construction helper (GangaObject.__createNewList) was tried here but did not
    # work for reasons that were never understood, so build the list directly.
    from GangaDirac.Lib.Files.DiracFile import DiracFile
    logger.debug("Creating new list")
    new_files = [DiracFile(lfn=f) for f in files]

    logger.info("Constructing LHCbDataset")
    from GangaLHCb.Lib.LHCbDataset import LHCbDataset
    logger.debug("Imported LHCbDataset")
    ds = LHCbDataset(files=new_files, fromRef=True)
    logger.debug("Returning Dataset")
    return addProxy(ds)
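# Illustrative sketch only, not part of the original module: a typical way getDataset()
# above is reached from a Ganga session, assuming this method belongs to the BKQuery GPI
# object and a valid grid proxy is available. The bookkeeping path is a placeholder and
# the helper name is hypothetical.
def _example_bk_query_usage():
    q = BKQuery(path='/LHCb/Collision16/Beam6500GeV-VeloClosed-MagDown/Real Data/Reco16/'
                     'Stripping28/90000000/BHADRON.MDST',
                dqflag='OK', type='Path')
    ds = q.getDataset()  # LHCbDataset of DiracFile objects built from the BK LFNs
    print('%d files returned' % len(ds.files))
    return ds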
def updateQuery(self, resubmit=False):
    """Update the dataset information of the transform. This will pick up any new data for
    processing and re-run jobs whose data has been removed."""
    if len(self.queries) == 0:
        raise GangaException(None, 'Cannot call updateQuery() on an LHCbTransform without any queries')

    if self._getParent() is not None:
        logger.info('Retrieving latest bookkeeping information for transform %i:%i, please wait...' %
                    (self._getParent().id, self.getID()))
    else:
        logger.info('Retrieving latest bookkeeping information for transform, please wait...')

    # check we have an input DS per BK Query
    while len(self.queries) > len(self.inputdata):
        self.inputdata.append(LHCbDataset())

    # loop over the queries and fill the file lists
    for id, query in enumerate(self.queries):

        # Get the latest dataset
        latest_dataset = query.getDataset()

        # Compare to previous inputdata, get new and removed
        logger.info('Checking for new and removed data for query %d, please wait...' %
                    self.queries.index(query))
        dead_data = LHCbDataset()
        new_data = LHCbDataset()

        # loop over the old data and compare
        new_data.files += latest_dataset.difference(self.inputdata[id]).files
        dead_data.files += self.inputdata[id].difference(latest_dataset).files

        # for dead data, find then kill/remove any associated jobs
        # loop over units and check any associated with this DS
        # TODO: Follow through chained tasks
        for unit in self.units:

            # associated unit
            if unit.input_datset_index != id:
                continue

            # find the job
            if len(unit.active_job_ids) == 0:
                continue

            # check the data
            for f in dead_data.files:
                if f in unit.inputdata.files:

                    # kill the job
                    job = getJobByID(unit.active_job_ids[0])
                    if job.status in ['submitted', 'running']:
                        job.kill()

                    # forget the job
                    unit.prev_job_ids.append(unit.active_job_ids[0])
                    unit.active_job_ids = []
                    break

        # in any case, now just set the DS files to the new set
        self.inputdata[id].files = []
        self.inputdata[id].files = latest_dataset.files
def createUnits(self):
    """Create new units if required given the inputdata"""

    # call parent for chaining
    super(LHCbTransform, self).createUnits()

    if len(self.inputdata) > 0:

        # check for conflicting input
        if self.mc_num_units > 0:
            logger.warning("Inputdata specified - MC Event info ignored")

        # loop over input data and see if we need to create any more units
        import copy
        for id, inds in enumerate(self.inputdata):

            if not isType(inds, LHCbDataset):
                continue

            # go over the units and see what files have been assigned
            assigned_data = LHCbDataset()
            for unit in self.units:
                if unit.input_datset_index != id:
                    continue
                assigned_data.files += unit.inputdata.files

            # any new files
            new_data = LHCbDataset(files=self.inputdata[id].difference(assigned_data).files)
            if len(new_data.files) == 0:
                continue

            # Create units for these files
            step = self.files_per_unit
            if step <= 0:
                step = len(new_data.files)

            for num in range(0, len(new_data.files), step):
                unit = LHCbUnit()
                unit.name = "Unit %d" % len(self.units)
                unit.input_datset_index = id
                self.addUnitToTRF(unit)
                unit.inputdata = copy.deepcopy(self.inputdata[id])
                unit.inputdata.files = []
                unit.inputdata.files += new_data.files[num:num + step]

    elif self.mc_num_units > 0:
        if len(self.units) == 0:

            # check for an appropriate splitter
            from GangaLHCb.Lib.Splitters.GaussSplitter import GaussSplitter
            if not self.splitter or not isType(self.splitter, GaussSplitter):
                logger.warning("No GaussSplitter specified - first event info ignored")

            # create units for MC generation
            for i in range(0, self.mc_num_units):
                unit = LHCbUnit()
                unit.name = "Unit %d" % len(self.units)
                self.addUnitToTRF(unit)
    else:
        import traceback
        traceback.print_stack()
        logger.error("Please specify either inputdata or MC info for unit generation")
def update(self, resubmit=False):
    """Update the dataset information of the transform. This will pick up any new data for
    processing and re-run jobs whose data has been removed."""
    if self.query is None:
        raise GangaException(None, 'Cannot call update() on an LHCbTransform without the query attribute set')
    if len(self.toProcess_dataset.files):
        raise GangaException(
            None, 'Cannot call update() on an LHCbTransform that has already been updated. '
            'There is outstanding data to process, try typing transform.run()')

    # Get the latest dataset
    logger.info('Retrieving latest bookkeeping information for transform %i:%i, please wait...' %
                (self.task_id, self.transform_id))
    latest_dataset = self.query.getDataset()
    self.toProcess_dataset.files = latest_dataset.files

    # Compare to previous inputdata, get new and removed
    logger.info('Checking for new and removed data, please wait...')
    dead_data = LHCbDataset()
    if self.inputdata is not None:
        # Get new files
        self.toProcess_dataset.files = latest_dataset.difference(self.inputdata).files
        # Get removed files
        dead_data.files += self.inputdata.difference(latest_dataset).files

    # If nothing is to be updated then exit, otherwise carry out the actions needed
    redo_jobs = self._getJobsWithRemovedData(dead_data)
    new_jobs = len(self.toProcess_dataset.files)
    if not new_jobs and not redo_jobs:
        logger.info('Transform %i:%i is already up to date' % (self.task_id, self.transform_id))
        return

    if redo_jobs and not resubmit:
        logger.info('There are jobs with out-of-date datasets, some datafiles must be removed. '
                    'Updating will mean loss of existing output and that merged data will change '
                    'accordingly. Due to the permanent nature of this request, please recall '
                    'update with the True argument as update(True)')
        self.toProcess_dataset.files = []
        return

    if redo_jobs:
        self.removed_data.files += dead_data.files
        for j in redo_jobs:
            if j.status in ['submitting', 'submitted', 'running', 'completing']:
                logger.warning('Job \'%s\' is still running but is marked for resubmission due to '
                               'removed data. It will be killed first' % j.fqid)
                j.kill()
            logger.info('Resubmitting job \'%s\' as its dataset is out of date.' % j.fqid)
            j.resubmit()

    if new_jobs:
        logger.info('Transform %i:%i updated, adding partition %i containing %i more file(s) for processing' %
                    (self.task_id, self.transform_id, len(self._partition_status),
                     len(self.toProcess_dataset.files)))
        self.setPartitionStatus(len(self._partition_status), 'ready')
        if self.status != 'new':
            # After the first time, when the transform is running or complete, calling
            # update will submit the jobs, thereby blocking the user thread
            self._submitJobs(1)

    self.inputdata = LHCbDataset(latest_dataset.files)
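# Illustrative sketch only, not part of the original module: the calling pattern implied by
# the messages inside update() above. `transform` stands for an LHCbAnalysisTransform with
# its `query` attribute set; the helper name is hypothetical.
def _example_transform_update(transform):
    transform.update()   # fetch new files from the bookkeeping query
    transform.run()      # process the outstanding data, as update()'s own message suggests
    # If update() warns about jobs whose input files were removed, the destructive
    # resubmission must be confirmed explicitly:
    # transform.update(True)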
def __init__(self):
    super(LHCbAnalysisTransform, self).__init__()
    self.toProcess_dataset = LHCbDataset()
    self.removed_data = LHCbDataset()
def make_new_dataset(files):
    ds = LHCbDataset(files)
    return ds