class ITransform(GangaObject): _schema = Schema(Version(1, 0), { 'status': SimpleItem(defvalue='new', protected=1, copyable=1, doc='Status - running, pause or completed', typelist=[str]), 'name': SimpleItem(defvalue='Simple Transform', doc='Name of the transform (cosmetic)', typelist=[str]), 'application': ComponentItem('applications', defvalue=None, optional=1, load_default=False, doc='Application of the Transform.'), 'inputsandbox': FileItem(defvalue=[], sequence=1, doc="list of File objects shipped to the worker node "), 'outputsandbox': SimpleItem(defvalue=[], typelist=[str], sequence=1, doc="list of filenames or patterns shipped from the worker node"), 'backend': ComponentItem('backends', defvalue=None, optional=1, load_default=False, doc='Backend of the Transform.'), 'splitter': ComponentItem('splitters', defvalue=None, optional=1, load_default=False, doc='Splitter used on each unit of the Transform.'), 'postprocessors': ComponentItem('postprocessor', defvalue=None, doc='list of postprocessors to run after job has finished'), 'merger': ComponentItem('mergers', defvalue=None, hidden=1, copyable=0, load_default=0, optional=1, doc='Merger to be done over all units when complete.'), 'unit_merger': ComponentItem('mergers', defvalue=None, load_default=0, optional=1, doc='Merger to be copied and run on each unit separately.'), 'copy_output': ComponentItem('datasets', defvalue=None, load_default=0, optional=1, doc='The dataset to copy all units output to, e.g. Grid dataset -> Local Dataset'), 'unit_copy_output': ComponentItem('datasets', defvalue=None, load_default=0, optional=1, doc='The dataset to copy each individual unit output to, e.g. Grid dataset -> Local Dataset'), 'run_limit': SimpleItem(defvalue=8, doc='Number of times a partition is tried to be processed.', protected=1, typelist=[int]), 'minor_run_limit': SimpleItem(defvalue=3, doc='Number of times a unit can be resubmitted', protected=1, typelist=[int]), 'major_run_limit': SimpleItem(defvalue=3, doc='Number of times a junit can be rebrokered', protected=1, typelist=[int]), 'units': ComponentItem('units', defvalue=[], sequence=1, copyable=1, doc='list of units'), 'inputdata': ComponentItem('datasets', defvalue=[], sequence=1, protected=1, optional=1, load_default=False, doc='Input datasets to run over'), 'outputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Output dataset template'), 'inputfiles': GangaFileItem(defvalue=[], sequence=1, doc="list of file objects that will act as input files for a job"), 'outputfiles' : GangaFileItem(defvalue=[], sequence=1, doc="list of OutputFile objects to be copied to all jobs"), 'metadata': ComponentItem('metadata', defvalue=MetadataDict(), doc='the metadata', protected=1), 'rebroker_on_job_fail': SimpleItem(defvalue=True, doc='Rebroker if too many minor resubs'), 'abort_loop_on_submit': SimpleItem(defvalue=True, doc='Break out of the Task Loop after submissions'), 'required_trfs': SimpleItem(defvalue=[], typelist=[int], sequence=1, doc="IDs of transforms that must complete before this unit will start. NOTE DOESN'T COPY OUTPUT DATA TO INPUT DATA. Use TaskChainInput Dataset for that."), 'chain_delay': SimpleItem(defvalue=0, doc='Minutes delay between a required/chained unit completing and starting this one', protected=0, typelist=[int]), 'submit_with_threads': SimpleItem(defvalue=False, doc='Use Ganga Threads for submission'), 'max_active_threads': SimpleItem(defvalue=10, doc='Maximum number of Ganga Threads to use. 
Note that the number of simultaneous threads is controlled by the queue system (default is 5)'), 'info' : SimpleItem(defvalue=[],typelist=[str],protected=1,sequence=1,doc="Info showing status transitions and unit info"), 'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Transform', typelist=[int]), #'force_single_unit' : SimpleItem(defvalue=False, doc='Force all input data into one Unit'), }) _category = 'transforms' _name = 'ITransform' _exportmethods = ['addInputData', 'resetUnit', 'setRunLimit', 'getJobs', 'setMinorRunLimit', 'setMajorRunLimit', 'getID', 'overview', 'resetUnitsByStatus', 'removeUnusedJobs', 'showInfo', 'showUnitInfo', 'pause', 'n_all', 'n_status' ] _hidden = 0 def showInfo(self): """Print out the info in a nice way""" print("\n".join( self.info )) def showUnitInfo(self, uid): """Print out the given unit info in a nice way""" self.units[uid].showInfo() def getJobs(self): """Return a list of the currently active job ids""" joblist = [] for u in self.units: joblist += u.active_job_ids return joblist def setMinorRunLimit(self, newRL): """Set the number of times a job will be resubmitted before a major resubmit is attempted""" self.minor_run_limit = newRL def setMajorRunLimit(self, newRL): """Set the number of times a job will be rebrokered before the transform is paused""" self.major_run_limit = newRL def setRunLimit(self, newRL): """Set the total (minor+major) number of times a job should be resubmitted before the transform is paused""" self.run_limit = newRL def overview(self, status=''): """Show the status of the units in this transform""" for unit in self.units: # display colour given state o = "" o += ("%d: " % self.units.index(unit)) + unit.name # is unit active? if unit.active: o += " " * (40 - len(o) + 3) + "*" else: o += " " * (40 - len(o) + 3) + "-" # sub job status o += "\t %i" % unit.n_status("submitted") o += "\t %i" % unit.n_status("running") o += "\t %i" % unit.n_status("completed") o += "\t %i" % unit.n_status("failed") o += "\t %i" % unit.minor_resub_count o += "\t %i" % unit.major_resub_count # change colour on state if unit.status == 'completed': o = markup(o, overview_colours["completed"]) elif not unit.active: o = markup(o, overview_colours["bad"]) elif unit.status == "recreating": o = markup(o, overview_colours["attempted"]) elif len(unit.active_job_ids) == 0: o = markup(o, overview_colours["hold"]) else: o = markup(o, overview_colours["running"]) print(o) # Special methods: def __init__(self): super(ITransform, self).__init__() self.initialize() def _auto__init__(self): self.status = 'new' def _readonly(self): """A transform is read-only if the status is not new.""" if self.status == "new": return 0 return 1 def initialize(self): from Ganga.Lib.Localhost.Localhost import Localhost self.backend = Localhost() def check(self): """Check this transform has valid data, etc. 
and has the correct units""" # ignore anything but new transforms if self.status != "new": return # first, validate the transform if not self.validate(): raise ApplicationConfigurationError( None, "Validate failed for Transform %s" % self.name) self.updateStatus("running") def startup(self): """This function is used to set the status after restarting Ganga""" pass # Public methods def resetUnit(self, uid): """Reset the given unit""" addInfoString( self, "Reseting Unit %i" % ( uid ) ) for u in self.units: if u.getID() == uid: u.reset() break # find any chained units and mark for recreation for trf in self._getParent().transforms: for u2 in trf.units: for req in u2.req_units: if req == "%d:%d" % (self.getID(), u.getID()) or req == "%d:ALL" % (self.getID()): trf.resetUnit(u2.getID()) self.updateStatus("running") def getID(self): """Return the index of this trf in the parent task""" # if the id isn't already set, use the index from the parent Task if self.id < 0: task = self._getParent() if not task: raise ApplicationConfigurationError( None, "This transform has not been associated with a task and so there is no ID available") self.id = task.transforms.index(self) return self.id def run(self, check=True): """Sets this transform to running status""" if self.status == "new" and check: self.check() if self.status != "completed": self.updateStatus("running") task = self._getParent() if task: task.updateStatus() else: logger.warning("Transform is already completed!") def update(self): """Called by the parent task to check for status updates, submit jobs, etc.""" if self.status == "pause" or self.status == "new": return 0 # check for complete required units task = self._getParent() for trf_id in self.required_trfs: if task.transforms[trf_id].status != "completed": return 0 # set the start time if not already set if len(self.required_trfs) > 0 and self.units[0].start_time == 0: for unit in self.units: unit.start_time = time.time() + self.chain_delay * 60 - 1 # report the info for this transform unit_status = { "new":0, "hold":0, "running":0, "completed":0, "bad":0, "recreating":0 } for unit in self.units: unit_status[unit.status] += 1 info_str = "Unit overview: %i units, %i new, %i hold, %i running, %i completed, %i bad. 
to_sub %i" % (len(self.units), unit_status["new"], unit_status["hold"], unit_status["running"], unit_status["completed"], unit_status["bad"], self._getParent().n_tosub()) addInfoString(self, info_str) # ask the unit splitter if we should create any more units given the # current data self.createUnits() # loop over units and update them ((re)submits will be called here) old_status = self.status unit_status_list = [] # find submissions first unit_update_list = [] for unit in self.units: if not unit.checkForSubmission() and not unit.checkForResubmission(): unit_update_list.append(unit) continue if unit.update() and self.abort_loop_on_submit: logger.info("Unit %d of transform %d, Task %d has aborted the loop" % ( unit.getID(), self.getID(), task.id)) return 1 unit_status_list.append(unit.status) # now check for download for unit in unit_update_list: if unit.update() and self.abort_loop_on_submit: logger.info("Unit %d of transform %d, Task %d has aborted the loop" % ( unit.getID(), self.getID(), task.id)) return 1 unit_status_list.append(unit.status) from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput # check for any TaskChainInput completions for ds in self.inputdata: if isType(ds, TaskChainInput) and ds.input_trf_id != -1: if task.transforms[ds.input_trf_id].status != "completed": return 0 # update status and check for state in ['running', 'hold', 'bad', 'completed']: if state in unit_status_list: if state == 'hold': state = "running" if state != self.status: self.updateStatus(state) break def createUnits(self): """Create new units if required given the inputdata""" from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput # check for chaining for ds in self.inputdata: if isType(ds, TaskChainInput) and ds.input_trf_id != -1: # check for single unit if ds.single_unit: # is there a unit already linked? done = False rec_unit = None for out_unit in self.units: if '%d:ALL' % (ds.input_trf_id) in out_unit.req_units: done = True # check if the unit is being recreated if out_unit.status == "recreating": rec_unit = out_unit break if not done or rec_unit: new_unit = self.createChainUnit( self._getParent().transforms[ds.input_trf_id].units, ds.use_copy_output) if new_unit: self.addChainUnitToTRF( new_unit, ds, -1, prev_unit=rec_unit) else: # loop over units in parent trf and create units as # required for in_unit in self._getParent().transforms[ds.input_trf_id].units: # is there a unit already linked? done = False rec_unit = None for out_unit in self.units: if '%d:%d' % (ds.input_trf_id, in_unit.getID()) in out_unit.req_units: done = True # check if the unit is being recreated if out_unit.status == "recreating": rec_unit = out_unit break if not done or rec_unit: new_unit = self.createChainUnit( [in_unit], ds.use_copy_output) if new_unit: self.addChainUnitToTRF( new_unit, ds, in_unit.getID(), prev_unit=rec_unit) def createChainUnit(self, parent_units, use_copy_output=True): """Create a chained unit given the parent outputdata""" return IUnit() def addChainUnitToTRF(self, unit, inDS, unit_id=-1, prev_unit=None): """Add a chained unit to this TRF. 
Override for more control""" if unit_id == -1: unit.req_units.append('%d:ALL' % (inDS.input_trf_id)) unit.name = "Parent: TRF %d, All Units" % (inDS.input_trf_id) else: unit.req_units.append('%d:%d' % (inDS.input_trf_id, unit_id)) unit.name = "Parent: TRF %d, Unit %d" % ( inDS.input_trf_id, unit_id) self.addUnitToTRF(unit, prev_unit) def addInputData(self, inDS): """Add the given input dataset to the list""" self.inputdata.append(inDS) def pause(self): """Pause the task - the background thread will not submit new jobs from this task""" if self.status != "completed": self.updateStatus("pause") #self.status = "pause" task = self._getParent() if task: task.updateStatus() else: logger.debug("Transform is already completed!") def setRunlimit(self, newRL): """Set the number of times a job should be resubmitted before the transform is paused""" self.run_limit = newRL logger.debug("Runlimit set to %i", newRL) # Methods that can/should be overridden by derived classes def validate(self): """Override this to validate that the transform is OK""" from Ganga.GPIDev.Lib.Tasks.TaskLocalCopy import TaskLocalCopy # make sure a path has been selected for any local downloads if self.unit_copy_output is not None and isType(self.unit_copy_output, TaskLocalCopy): if self.unit_copy_output.local_location == '': logger.error("No path selected for Local Output Copy") return False if self.copy_output is not None and isType(self.copy_output, TaskLocalCopy): if self.copy_output.local_location == '': logger.error("No path selected for Local Output Copy") return False # this is a generic trf so assume the application and splitter will do # all the work return True def addUnitToTRF(self, unit, prev_unit=None): """Add a unit to this Transform given the input and output data""" if not unit: raise ApplicationConfigurationError(None, "addUnitTOTRF failed for Transform %d (%s): No unit specified" % (self.getID(), self.name)) addInfoString( self, "Adding Unit to TRF...") unit.updateStatus("hold") unit.active = True if prev_unit: unit.prev_job_ids += prev_unit.prev_job_ids self.units[prev_unit.getID()] = unit else: self.units.append(unit) stripProxy(unit).id = len(self.units) - 1 # Information methods def fqn(self): task = self._getParent() if task: return "Task %i Transform %i" % (task.id, task.transforms.index(self)) else: return "Unassigned Transform '%s'" % (self.name) def n_active(self): return sum([u.n_active() for u in self.units]) def n_all(self): return sum([u.n_all() for u in self.units]) def n_status(self, status): return sum([u.n_status(status) for u in self.units]) def info(self): logger.info(markup("%s '%s'" % (getName(self), self.name), status_colours[self.status])) logger.info("* backend: %s" % getName(self.backend)) logger.info("Application:") self.application.printTree() def updateStatus(self, status): """Update the transform status""" self.status = status def createUnitCopyOutputDS(self, unit_id): """Create a the Copy Output dataset to use with this unit. 
Overload to handle more than the basics""" from Ganga.GPIDev.Lib.Tasks.TaskLocalCopy import TaskLocalCopy if isType(self.unit_copy_output, TaskLocalCopy): logger.warning("Default implementation of createUnitCopyOutputDS can't handle datasets of type '%s'" % getName(self.unit_copy_output)) return # create copies of the Copy Output DS and add Unit name to path self.units[unit_id].copy_output = self.unit_copy_output.clone() self.units[unit_id].copy_output.local_location = os.path.join( self.unit_copy_output.local_location, self.units[unit_id].name.replace(":", "_").replace(" ", "").replace(",", "_")) def __setattr__(self, attr, value): if attr == 'outputfiles': if value != []: if self.outputdata is not None: logger.error( 'ITransform.outputdata is set, you can\'t set ITransform.outputfiles') return elif self.outputsandbox != []: logger.error( 'ITransform.outputsandbox is set, you can\'t set ITransform.outputfiles') return # reduce duplicate values here, leave only duplicates for LCG, # where we can have replicas uniqueValuesDict = [] uniqueValues = [] for val in value: key = '%s%s' % (getName(val), val.namePattern) if key not in uniqueValuesDict: uniqueValuesDict.append(key) uniqueValues.append(val) elif getName(val) == 'LCGSEFile': uniqueValues.append(val) super(ITransform, self).__setattr__(attr, uniqueValues) elif attr == 'inputfiles': if value != []: if self.inputsandbox != []: logger.error( 'ITransform.inputsandbox is set, you can\'t set ITransform.inputfiles') return super(ITransform, self).__setattr__(attr, value) elif attr == 'outputsandbox': if value != []: if getConfig('Output')['ForbidLegacyOutput']: logger.error( 'Use of ITransform.outputsandbox is forbidden, please use ITransform.outputfiles') return if self.outputfiles != []: logger.error( 'ITransform.outputfiles is set, you can\'t set ITransform.outputsandbox') return super(ITransform, self).__setattr__(attr, value) elif attr == 'inputsandbox': if value != []: if getConfig('Output')['ForbidLegacyInput']: logger.error( 'Use of ITransform.inputsandbox is forbidden, please use ITransform.inputfiles') return if self.inputfiles != []: logger.error( 'ITransform.inputfiles is set, you can\'t set ITransform.inputsandbox') return super(ITransform, self).__setattr__(attr, value) elif attr == 'outputdata': if value is not None: if getConfig('Output')['ForbidLegacyOutput']: logger.error( 'Use of ITransform.outputdata is forbidden, please use ITransform.outputfiles') return if self.outputfiles != []: logger.error( 'ITransform.outputfiles is set, you can\'t set ITransform.outputdata') return super(ITransform, self).__setattr__(attr, value) else: super(ITransform, self).__setattr__(attr, value) def resetUnitsByStatus(self, status='bad'): """Reset all units of a given status""" for unit in self.units: if unit.status == status: logger.info("Resetting Unit %d, Transform %d..." 
                            % (unit.getID(), self.getID()))
                self.resetUnit(unit.getID())

    def checkUnitsAreCompleted(self, parent_units):
        """Check the given parent units are complete"""
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return False
        return True

    def getChainInclExclMasks(self, parent_units):
        """Return the include/exclude masks from the TaskChainInput"""
        incl_pat_list = []
        excl_pat_list = []
        from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
        for parent in parent_units:
            for inds in self.inputdata:
                if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask
        return incl_pat_list, excl_pat_list

    def getParentUnitJobs(self, parent_units, include_subjobs=True):
        """Return the list of parent jobs"""
        job_list = []
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            if job.subjobs:
                job_list += job.subjobs
            else:
                job_list += [job]
        return job_list

    def removeUnusedJobs(self):
        """Remove all jobs that aren't being used, e.g. failed jobs"""
        for unit in self.units:
            for jid in unit.prev_job_ids:
                try:
                    logger.warning("Removing job '%d'..." % jid)
                    job = getJobByID(jid)
                    job.remove()
                except Exception as err:
                    logger.debug("removeUnused: %s" % str(err))
                    logger.error("Problem removing job '%d'" % jid)
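# --------------------------------------------------------------------------
# Usage sketch (not executed on import): how a user would typically drive an
# ITransform from the Ganga prompt.  It assumes an interactive Ganga session
# where the GPI name `tasks` is available and task 0 already owns at least one
# transform; the ids and thresholds below are purely illustrative.
def _example_itransform_usage():
    trf = tasks(0).transforms[0]          # pick the first transform of task 0
    trf.setMinorRunLimit(2)               # allow 2 straight resubmits per unit
    trf.setMajorRunLimit(1)               # allow 1 rebrokering before pausing
    trf.overview()                        # coloured per-unit status table
    print("completed jobs: %d / %d" % (trf.n_status("completed"), trf.n_all()))
    trf.resetUnitsByStatus("bad")         # recreate any units flagged as bad
    if trf.n_status("failed") > 10:
        trf.pause()                       # stop the monitoring loop submitting more
# --------------------------------------------------------------------------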
class GangaDataset(Dataset):
    '''Class for handling generic datasets of input files'''
    schema = {}
    docstr = 'List of File objects'
    schema['files'] = GangaFileItem(
        defvalue=[], sequence=1,
        doc="list of file objects that will be the inputdata for the job")
    schema['treat_as_inputfiles'] = SimpleItem(
        defvalue=False,
        doc="Treat the inputdata as inputfiles, i.e. copy the inputdata to the WN")
    _schema = Schema(Version(3, 0), schema)
    _category = 'datasets'
    _name = "GangaDataset"
    _exportmethods = ['append', 'extend', '__len__', '__getitem__', '__nonzero__',
                      'isEmpty', 'getFileNames', 'getFilenameList',
                      'difference', 'isSubset', 'isSuperset',
                      'symmetricDifference', 'intersection', 'union']

    def __init__(self, files=None):
        if files is None:
            files = []
        super(GangaDataset, self).__init__()
        self.files = files

    def __len__(self):
        """The number of files in the dataset."""
        result = 0
        if self.files:
            result = len(self.files)
        return result

    def __nonzero__(self):
        """This is always True, as with a regular object."""
        return True

    def __getitem__(self, i):
        '''Provides scripting access (e.g. ds[2] returns the 3rd file)'''
        if isinstance(i, type(slice(0))):
            ds = GangaDataset(files=self.files[i])
            return ds
        else:
            return self.files[i]

    def isEmpty(self):
        return not bool(self.files)

    def append(self, input_file):
        self.extend([input_file])

    def extend(self, files, unique=False):
        '''Extend the dataset. If unique, then only add files which are not already in the dataset.'''
        from Ganga.GPIDev.Base import ReadOnlyObjectError
        if not hasattr(files, "__getitem__"):
            raise GangaException('Argument "files" must be an iterable.')
        if self._getParent() is not None and self._getParent()._readonly():
            raise ReadOnlyObjectError(
                'object Job#%s is read-only and attribute "%s/inputdata" cannot be modified now'
                % (self._getParent().id, getName(self)))
        names = self.getFileNames()
        files = [f for f in files]  # just in case they extend w/ self
        for f in files:
            if unique and f.name in names:
                continue
            self.files.append(f)

    def getFileNames(self):
        'Returns a list of the names of all files stored in the dataset.'
        names = []
        for i in self.files:
            if hasattr(i, 'lfn'):
                names.append(i.lfn)
            else:
                try:
                    names.append(i.namePattern)
                except Exception:
                    logger.warning("Cannot determine filename for: %s " % i)
                    raise GangaException("Cannot Get File Name")
        return names

    def getFilenameList(self):
        "return a list of filenames to be created as input.txt on the WN"
        filelist = []
        for f in self.files:
            if hasattr(f, 'accessURL'):
                filelist += f.accessURL()
            elif hasattr(f, 'getFilenameList'):
                filelist += f.getFilenameList()
            else:
                if isinstance(f, GangaObject):
                    logger.warning(
                        "accessURL or getFilenameList not implemented for File '%s'" % getName(f))
                else:
                    logger.warning("Warning, not sure how to parse file: %s" % str(f))
        return filelist

    def difference(self, other):
        '''Returns a new data set w/ files in this that are not in other.'''
        other_files = other.getFullFileNames()
        files = set(self.getFullFileNames()).difference(other_files)
        data = GangaDataset()
        data.extend(list(files))
        data.depth = self.depth
        return data

    def isSubset(self, other):
        '''Is every file in this data set in other?'''
        return set(self.getFileNames()).issubset(other.getFileNames())

    def isSuperset(self, other):
        '''Is every file in other in this data set?'''
        return set(self.getFileNames()).issuperset(other.getFileNames())

    def symmetricDifference(self, other):
        '''Returns a new data set w/ files in either this or other but not both.'''
        other_files = other.getFullFileNames()
        files = set(self.getFullFileNames()).symmetric_difference(other_files)
        data = GangaDataset()
        data.extend(list(files))
        data.depth = self.depth
        return data

    def intersection(self, other):
        '''Returns a new data set w/ files common to this and other.'''
        other_files = other.getFullFileNames()
        files = set(self.getFullFileNames()).intersection(other_files)
        data = GangaDataset()
        data.extend(list(files))
        data.depth = self.depth
        return data

    def union(self, other):
        '''Returns a new data set w/ files from this and other.'''
        files = set(self.getFullFileNames()).union(other.getFullFileNames())
        data = GangaDataset()
        data.extend(list(files))
        data.depth = self.depth
        return data
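# --------------------------------------------------------------------------
# Usage sketch (assumption: run from a Ganga session where `GangaDataset` and
# `LocalFile` are exposed through the GPI; the file names are illustrative).
def _example_gangadataset_usage():
    ds = GangaDataset()
    ds.extend([LocalFile('ntuple_1.root'), LocalFile('ntuple_2.root')])
    ds.append(LocalFile('ntuple_3.root'))
    print(len(ds))              # -> 3
    print(ds.getFileNames())    # namePattern (or LFN) for each stored file
    print(ds[0])                # indexing returns the stored file object
    subset = GangaDataset(files=[LocalFile('ntuple_1.root')])
    print(subset.isSubset(ds))  # -> True; the comparison is done on file names
# --------------------------------------------------------------------------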
class IUnit(GangaObject): _schema = Schema( Version(1, 0), { 'status': SimpleItem(defvalue='new', protected=1, copyable=0, doc='Status - running, pause or completed', typelist=["str"]), 'name': SimpleItem(defvalue='Simple Unit', doc='Name of the unit (cosmetic)', typelist=["str"]), 'application': ComponentItem('applications', defvalue=None, optional=1, load_default=False, doc='Application of the Transform.'), 'inputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Input dataset'), 'outputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Output dataset'), 'active': SimpleItem(defvalue=False, hidden=1, doc='Is this unit active'), 'active_job_ids': SimpleItem(defvalue=[], typelist=['int'], sequence=1, hidden=1, doc='Active job ids associated with this unit'), 'prev_job_ids': SimpleItem(defvalue=[], typelist=['int'], sequence=1, hidden=1, doc='Previous job ids associated with this unit'), 'minor_resub_count': SimpleItem(defvalue=0, hidden=1, doc='Number of minor resubmits'), 'major_resub_count': SimpleItem(defvalue=0, hidden=1, doc='Number of major resubmits'), 'req_units': SimpleItem( defvalue=[], typelist=['str'], sequence=1, hidden=1, doc= 'List of units that must complete for this to start (format TRF_ID:UNIT_ID)' ), 'start_time': SimpleItem( defvalue=0, hidden=1, doc='Start time for this unit. Allows a delay to be put in'), 'copy_output': ComponentItem( 'datasets', defvalue=None, load_default=0, optional=1, doc= 'The dataset to copy the output of this unit to, e.g. Grid dataset -> Local Dataset' ), 'merger': ComponentItem('mergers', defvalue=None, load_default=0, optional=1, doc='Merger to be run after this unit completes.'), 'splitter': ComponentItem('splitters', defvalue=None, optional=1, load_default=False, doc='Splitter used on each unit of the Transform.'), 'postprocessors': ComponentItem( 'postprocessor', defvalue=None, doc='list of postprocessors to run after job has finished'), 'inputsandbox': FileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Lib.File.File.File'], sequence=1, doc="list of File objects shipped to the worker node "), 'inputfiles': GangaFileItem( defvalue=[], typelist=[ 'str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile' ], sequence=1, doc= "list of file objects that will act as input files for a job"), 'outputfiles': GangaFileItem( defvalue=[], typelist=[ 'str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile' ], sequence=1, doc="list of OutputFile objects to be copied to all jobs"), 'info': SimpleItem(defvalue=[], typelist=['str'], protected=1, sequence=1, doc="Info showing status transitions and unit info"), 'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Unit', typelist=["int"]), }) _category = 'units' _name = 'IUnit' _exportmethods = [] _hidden = 0 # Special methods: def __init__(self): super(IUnit, self).__init__() self.updateStatus("new") def _readonly(self): """A unit is read-only if the status is not new.""" if self.status == "new": return 0 return 1 def validate(self): """Validate that this unit is OK and set it to active""" self.active = True return True def getID(self): """Get the ID of this unit within the transform""" # if the id isn't already set, use the index from the parent Task if self.id < 0: trf = self._getParent() if not trf: raise ApplicationConfigurationError( None, "This unit has not been associated with a transform and so there is no ID available" ) self.id = trf.units.index(self) return self.id def updateStatus(self, status): """Update status hook""" 
addInfoString( self, "Status change from '%s' to '%s'" % (self.status, status)) self.status = status def createNewJob(self): """Create any jobs required for this unit""" pass def checkCompleted(self, job): """Check if this unit is complete""" if job.status == "completed": return True else: return False def checkForSubmission(self): """Check if this unit should submit a job""" # check the delay if time.time() < self.start_time: return False # check if we already have a job if len(self.active_job_ids) != 0: return False # if we're using threads, check the max number if self._getParent( ).submit_with_threads and GPI.queues.totalNumUserThreads( ) > self._getParent().max_active_threads: return False return True def checkForResubmission(self): """check if this unit should be resubmitted""" # check if we already have a job if len(self.active_job_ids) == 0: return False else: job = GPI.jobs(self.active_job_ids[0]) if job.status in ["failed", "killed"]: return True return False def checkParentUnitsAreComplete(self): """Check to see if the parent units are complete""" req_ok = True task = self._getParent()._getParent() for req in self.req_units: req_trf_id = int(req.split(":")[0]) if req.find("ALL") == -1: req_unit_id = int(req.split(":")[1]) if task.transforms[req_trf_id].units[ req_unit_id].status != "completed": req_ok = False else: # need all units from this trf for u in task.transforms[req_trf_id].units: if u.status != "completed": req_ok = False return req_ok def checkMajorResubmit(self, job): """check if this job needs to be fully rebrokered or not""" pass def majorResubmit(self, job): """perform a mjor resubmit/rebroker""" self.prev_job_ids.append(job.id) self.active_job_ids.remove(job.id) def minorResubmit(self, job): """perform just a minor resubmit""" try: trf = self._getParent() except Exception as err: logger.debug("GetParent exception!\n%s" % str(err)) trf = None if trf is not None and trf.submit_with_threads: addInfoString(self, "Attempting job re-submission with queues...") GPI.queues.add(job.resubmit) else: addInfoString(self, "Attempting job re-submission...") job.resubmit() def update(self): """Update the unit and (re)submit jobs as required""" #logger.warning("Entered Unit %d update function..." % self.getID()) # if we're complete, then just return if self.status in ["completed", "recreating"] or not self.active: return 0 # check if submission is needed task = self._getParent()._getParent() trf = self._getParent() maxsub = task.n_tosub() # check parent unit(s) req_ok = self.checkParentUnitsAreComplete() # set the start time if not already set if len(self.req_units) > 0 and req_ok and self.start_time == 0: self.start_time = time.time() + trf.chain_delay * 60 - 1 if req_ok and self.checkForSubmission() and maxsub > 0: # create job and submit addInfoString(self, "Creating Job...") j = self.createNewJob() if j.name == '': j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID()) try: if trf.submit_with_threads: addInfoString(self, "Attempting job submission with queues...") GPI.queues.add(j.submit) else: addInfoString(self, "Attempting job submission...") j.submit() except Exception as err: logger.debug("update Err: %s" % str(err)) addInfoString(self, "Failed Job Submission") addInfoString(self, "Reason: %s" % (formatTraceback())) logger.error("Couldn't submit the job. 
Deactivating unit.") self.prev_job_ids.append(j.id) self.active = False trf._setDirty() # ensure everything's saved return 1 self.active_job_ids.append(j.id) self.updateStatus("running") trf._setDirty() # ensure everything's saved if trf.submit_with_threads: return 0 return 1 # update any active jobs for jid in self.active_job_ids: # we have an active job so see if this job is OK and resubmit if # not try: job = GPI.jobs(jid) except Exception as err: logger.debug("Update2 Err: %s" % str(err)) logger.warning( "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue if job.status == "completed": # check if actually completed if not self.checkCompleted(job): return 0 # check for DS copy if trf.unit_copy_output: if not self.copy_output: trf.createUnitCopyOutputDS(self.getID()) if not self.copyOutput(): return 0 # check for merger if trf.unit_merger: if not self.merger: self.merger = trf.createUnitMerger(self.getID()) if not self.merge(): return 0 # all good so mark unit as completed self.updateStatus("completed") elif job.status == "failed" or job.status == "killed": # check for too many resubs if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1: logger.error( "Too many resubmits (%i). Deactivating unit." % (self.minor_resub_count + self.major_resub_count)) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count)) self.active = False return 0 rebroker = False if self.minor_resub_count > trf.minor_run_limit - 1: if self._getParent().rebroker_on_job_fail: rebroker = True else: logger.error( "Too many minor resubmits (%i). Deactivating unit." % self.minor_resub_count) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.minor_resub_count)) self.active = False return 0 if self.major_resub_count > trf.major_run_limit - 1: logger.error( "Too many major resubmits (%i). Deactivating unit." % self.major_resub_count) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count)) self.active = False return 0 # check the type of resubmit if rebroker or self.checkMajorResubmit(job): self.major_resub_count += 1 self.minor_resub_count = 0 try: addInfoString(self, "Attempting major resubmit...") self.majorResubmit(job) except Exception as err: logger.debug("Update Err3: %s" % str(err)) logger.error( "Couldn't resubmit the job. Deactivating unit.") addInfoString(self, "Failed Job resubmission") addInfoString(self, "Reason: %s" % (formatTraceback())) self.active = False # break the loop now because we've probably changed the # active jobs list return 1 else: self.minor_resub_count += 1 try: addInfoString(self, "Attempting minor resubmit...") self.minorResubmit(job) except Exception as err: logger.debug("Update Err4: %s" % str(err)) logger.error( "Couldn't resubmit the job. 
Deactivating unit.") addInfoString(self, "Failed Job resubmission") addInfoString(self, "Reason: %s" % (formatTraceback())) self.active = False return 1 def reset(self): """Reset the unit completely""" addInfoString(self, "Reseting Unit...") self.minor_resub_count = 0 self.major_resub_count = 0 if len(self.active_job_ids) > 0: self.prev_job_ids += self.active_job_ids self.active_job_ids = [] self.active = True # if has parents, set to recreate if len(self.req_units) > 0: self.updateStatus("recreating") else: self.updateStatus("running") # Info routines def n_active(self): if self.status == 'completed': return 0 tot_active = 0 active_states = ['submitted', 'running'] for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_active Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue j = stripProxy(job) # try to preserve lazy loading if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache( ) and 'subjobs:status' in j.getNodeIndexCache(): if len(j.getNodeIndexCache()['subjobs:status']) > 0: for sj_stat in j.getNodeIndexCache()['subjobs:status']: if sj_stat in active_states: tot_active += 1 else: if j.getNodeIndexCache()['status'] in active_states: tot_active += 1 else: #logger.warning("WARNING: (active check) No index cache for job object %d" % jid) if j.status in active_states: if j.subjobs: for sj in j.subjobs: if sj.status in active_states: tot_active += 1 else: tot_active += 1 return tot_active def n_status(self, status): tot_active = 0 for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_status Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue j = stripProxy(job) # try to preserve lazy loading if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache( ) and 'subjobs:status' in j.getNodeIndexCache(): if len(j.getNodeIndexCache()['subjobs:status']) > 0: for sj_stat in j.getNodeIndexCache()['subjobs:status']: if sj_stat == status: tot_active += 1 else: if j.getNodeIndexCache()['status'] == status: tot_active += 1 else: #logger.warning("WARNING: (status check) No index cache for job object %d" % jid) if j.subjobs: for sj in j.subjobs: if sj.status == status: tot_active += 1 else: if j.status == status: tot_active += 1 return tot_active def n_all(self): total = 0 for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_all Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. 
Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache() and 'subjobs:status' in j.getNodeIndexCache():
                if len(j.getNodeIndexCache()['subjobs:status']) != 0:
                    total += len(j.getNodeIndexCache()['subjobs:status'])
                else:
                    total += 1
            else:
                #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
                # accumulate over all active jobs, matching the cached branch above
                if j.subjobs:
                    total += len(j.subjobs)
                else:
                    total += 1

        return total

    def overview(self):
        """Print an overview of this unit"""
        o = " Unit %d: %s " % (self.getID(), self.name)
        for s in ["submitted", "running", "completed", "failed", "unknown"]:
            o += markup("%i " % self.n_status(s), overview_colours[s])
        print(o)

    def copyOutput(self):
        """Copy any output to the given dataset"""
        logger.error(
            "No default implementation for Copy Output - contact plugin developers")
        return False
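# --------------------------------------------------------------------------
# Implementation sketch: the hooks a concrete unit is expected to override.
# `EchoUnit` is a hypothetical example, not a real Ganga plugin, and it assumes
# the GPI `Job` and `Executable` objects are available when createNewJob() is
# invoked by ITransform.update(); a real plugin would also need registering.
class EchoUnit(IUnit):
    _schema = IUnit._schema.inherit_copy()
    _category = 'units'
    _name = 'EchoUnit'

    def createNewJob(self):
        """Build the Job that this unit will submit."""
        j = GPI.Job()
        j.application = GPI.Executable(exe='echo', args=[self.name])
        j.backend = self._getParent().backend.clone()
        return j

    def checkMajorResubmit(self, job):
        """Only minor resubmits for this toy unit; never rebroker."""
        return False

    def majorResubmit(self, job):
        """Fall back to the default book-keeping from IUnit."""
        super(EchoUnit, self).majorResubmit(job)
# --------------------------------------------------------------------------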
class GaudiExec(IPrepareApp): """ Welcome to the new GaudiApp for LHCb apps written/constructed making use of the new CMake framework ============= Requirements: ============= Before submitting jobs with this application you will need to run something similar to the following: (outside of Ganga at the command line) cd $SOMEPATH lb-dev DaVinci v40r2 cd $SOMEPATH/DaVinciDev_v40r2 getpack This program will perform the following command to `prepare` the application before submission: make ganga-input-sandbox NB: The output from this command can be quite large and Ganga will save it to disk and store it at least once per (master) job If your build target is large I would advise that you consider placing your gangadir in your AFS workspace where there is more storage available ====== Usage: ====== This application needs to be configured with the absolute directory of the project and the options you want to pass to gaudirun.py e.g. j=Job() myApp = GaudiExec() myApp.directory = "$SOMEPATH/DaVinciDev_v40r2" myApp.options = ["$SOMEPATH/DaVinciDev_v40r2/myDaVinciOpts.py"] j.application = myApp j.submit() To setup a minimal application you can also run the helper function: prepare_cmake_app(myApp, myVer, myPath, myGetpack) ============= How it works: ============= The actual command run on the WN is:: ./run gaudirun.py optionsFile.py data.py If you would prefer to have your optsfile run as a python application aka like 'GaudiPython' style jobs. Set: job.application.useGaudiRun = False This then changes the command run on the WN to be:: ./run python OptsFileWrapper.py Here the OptsFileWrapper script imports the extraOpts and the data.py describing the data to be run over and executes options in the global namespace with 'execfile' The OptsFileWrapper will _execute_ the first file in the job.application.options and will import all other opts files before executing this one. """ _schema = Schema( Version(1, 0), { # Options created for constructing/submitting this app 'directory': SimpleItem( defvalue='', typelist=[None, str], comparable=1, doc='A path to the project that you\'re wanting to run.'), 'options': GangaFileItem( defvalue=[], sequence=1, doc= 'List of files which contain the options I want to pass to gaudirun.py' ), 'uploadedInput': GangaFileItem( defvalue=None, hidden=1, doc= 'This stores the input for the job which has been pre-uploaded so that it gets to the WN' ), 'jobScriptArchive': GangaFileItem( defvalue=None, hidden=1, copyable=0, doc= 'This file stores the uploaded scripts which are generated fron this app to run on the WN' ), 'useGaudiRun': SimpleItem( defvalue=True, doc= 'Should \'options\' be run as "python options.py data.py" rather than "gaudirun.py options.py data.py"' ), 'platform': SimpleItem(defvalue='x86_64-slc6-gcc49-opt', typelist=[str], doc='Platform the application was built for'), 'extraOpts': SimpleItem( defvalue='', typelist=[str], doc= 'An additional string which is to be added to \'options\' when submitting the job' ), 'extraArgs': SimpleItem( defvalue=[], typelist=[list], sequence=1, doc= 'Extra runtime arguments which are passed to the code running on the WN' ), # Prepared job object 'is_prepared': SimpleItem( defvalue=None, strict_sequence=0, visitable=1, copyable=1, hidden=0, typelist=[None, ShareDir], protected=0, comparable=1, doc= 'Location of shared resources. Presence of this attribute implies the application has been prepared.' 
), 'hash': SimpleItem( defvalue=None, typelist=[None, str], hidden=1, doc= 'MD5 hash of the string representation of applications preparable attributes' ), }) _category = 'applications' _name = 'GaudiExec' _exportmethods = ['prepare', 'unprepare', 'execCmd', 'readInputData'] cmake_sandbox_name = 'cmake-input-sandbox.tgz' build_target = 'ganga-input-sandbox' build_dest = 'input-sandbox.tgz' sharedOptsFile_baseName = 'jobScripts-%s.tar' def __setattr__(self, attr, value): """ This overloads the baseclass setter method and allows for dynamic evaluation of a parameter on assignment Args: attr (str): Name of the attribute which is being assigned for this class value (unknown): The raw value which is being passed to this class for assigning to the attribute """ actual_value = value if attr == 'directory': if value: actual_value = path.abspath(fullpath(expandfilename(value))) elif attr == 'options': if isinstance(value, str): new_file = allComponentFilters['gangafiles'](value, None) actual_value = [new_file] elif isinstance(value, IGangaFile): actual_value = [value] elif not isinstance(value, (list, tuple, GangaList, type(None))): logger.warning( "Possibly setting wrong type for options: '%s'" % type(value)) super(GaudiExec, self).__setattr__(attr, actual_value) def unprepare(self, force=False): """ Unprepare the GaudiExec App Args: force (bool): Forces an un-prepare """ logger.debug('Running unprepare in GaudiExec app') if self.is_prepared is not None: self.decrementShareCounter(self.is_prepared.name) self.is_prepared = None self.hash = None self.uploadedInput = None self.jobScriptArchive = None def prepare(self, force=False): """ This method creates a set of prepared files for the application to pass to the RTHandler Args: force (bool): Forces a prepare to be run """ if (self.is_prepared is not None) and not force: raise ApplicationPrepareError( '%s application has already been prepared. Use prepare(force=True) to prepare again.' % getName(self)) # lets use the same criteria as the configure() method for checking file existence & sanity # this will bail us out of prepare if there's somthing odd with the job config - like the executable # file is unspecified, has a space or is a relative path self.configure(self) logger.info('Preparing %s application.' % getName(self)) self.is_prepared = ShareDir() logger.info('Created shared directory: %s' % (self.is_prepared.name)) this_build_target = self.buildGangaTarget() try: # copy any 'preparable' objects into the shared directory send_to_sharedir = self.copyPreparables() # add the newly created shared directory into the metadata system # if the app is associated with a persisted object self.checkPreparedHasParent(self) self.copyIntoPrepDir(this_build_target) all_opts_files = self.getOptsFiles() for opts_file in all_opts_files: if isinstance(opts_file, LocalFile): self.copyIntoPrepDir( path.join(opts_file.localDir, path.basename(opts_file.namePattern))) elif isinstance(opts_file, DiracFile): # NB safe to put it here as should have expressly setup a path for this job by now. # We cannot _not_ place this here based upon the backend. # Always have to put it here regardless of if we're on DIRAC or Local so prepared job can be copied. 
opts_file.get(localPath=self.getSharedPath()) else: raise ApplicationConfigurationError( None, "Opts file type %s not yet supported please contact Ganga devs if you require this support" % getName(opts_file)) self.post_prepare() except Exception as err: logger.debug("Err: %s" % str(err)) self.unprepare() raise self.cleanGangaTargetArea(this_build_target) return 1 def getExtraOptsFileName(self): """ Returns the name of the opts file which corresponds to the job which owns this app This places the script of interest in a subdir to not overly clutter the WN """ return path.join( 'opts', 'extra_opts_%s_.py' % self.getJobObject().getFQID('.')) def getWrapperScriptName(self): """ Returns the name of the wrapper script file which corresponds to the job which owns this app This places the script of interest in a subdir to not overly clutter the WN """ return path.join( 'wrapper', 'job_%s_optsFileWrapper.py' % self.getJobObject().getFQID('.')) def constructExtraFiles(self, job): """ This constructs or appends to an uncompressed archive containing all of the opts files which are required to run on the grid Args: job (Job): The parent job of this application, we don't care if it's unique or not """ master_job = job.master or job df = master_job.application.jobScriptArchive folder_dir = master_job.getInputWorkspace(create=True).getPath() if not df or df.namePattern == '': unique_name = GaudiExec.sharedOptsFile_baseName % uuid.uuid4() master_job.application.jobScriptArchive = LocalFile( namePattern=unique_name, localDir=folder_dir) tar_filename = path.join(folder_dir, unique_name) if not path.isfile(tar_filename): with tarfile.open(tar_filename, "w"): pass with tarfile.open(tar_filename, "a") as tar_file: tinfo = tarfile.TarInfo('__timestamp__') tinfo.mtime = time.time() fileobj = StringIO(getTimestampContent()) tinfo.size = fileobj.len tar_file.addfile(tinfo, fileobj) else: unique_name = master_job.application.jobScriptArchive.namePattern extra_opts_file = self.getExtraOptsFileName() # First construct if needed if not path.isfile(path.join(folder_dir, unique_name)): with tarfile.open(path.join(folder_dir, unique_name), "w"): pass # Now append the extra_opts file here when needed with tarfile.open(path.join(folder_dir, unique_name), "a") as tar_file: # Add the extra opts file to the job tinfo = tarfile.TarInfo(extra_opts_file) tinfo.mtime = time.time() fileobj = StringIO(self.extraOpts) tinfo.size = fileobj.len tar_file.addfile(tinfo, fileobj) if not self.useGaudiRun: # Add the WN script for wrapping the job logger.info("Constructing: %s" % self.getWrapperScriptName()) tinfo2 = tarfile.TarInfo(self.getWrapperScriptName()) tinfo2.mtime = time.time() fileobj2 = StringIO(self.getWNPythonContents()) tinfo2.size = fileobj2.len tar_file.addfile(tinfo2, fileobj2) def cleanGangaTargetArea(self, this_build_target): """ Method to remove the build target and other files not needed to reproduce the same build target again Args: this_build_target (str): This is the full path of the build target """ logger.debug("Cleaning up area after prepare") # Don't delete these preserved_set = set(['run']) build_dir = path.dirname(this_build_target) for obj in set(listdir(build_dir)) - preserved_set: logger.debug("del: %s of %s" % (obj, set(listdir(build_dir)) - preserved_set)) if path.isfile(path.join(build_dir, obj)): unlink(path.join(build_dir, obj)) elif path.isdir(path.join(build_dir, obj)): shutil.rmtree(path.join(build_dir, obj), ignore_errors=True) def configure(self, masterappconfig): """ Required even though nothing 
is done in this step for this App Args: masterappconfig (unknown): This is the output from the master_configure from the parent app """ # Lets test the inputs opt_file = self.getOptsFiles() dir_name = self.directory return (None, None) def getOptsFiles(self): """ This function returns a sanitized absolute path to the self.options file from user input """ if self.options: for this_opt in self.options: if isinstance(this_opt, LocalFile): ## FIXME LocalFile should return the basename and folder in 2 attibutes so we can piece it together, now it doesn't full_path = path.join(this_opt.localDir, this_opt.namePattern) if not path.exists(full_path): raise ApplicationConfigurationError( None, "Opts File: \'%s\' has been specified but does not exist please check and try again!" % full_path) elif isinstance(this_opt, DiracFile): pass else: logger.error("opts: %s" % self.options) raise ApplicationConfigurationError( None, "Opts file type %s not yet supported please contact Ganga devs if you require this support" % getName(this_opt)) return self.options else: raise ApplicationConfigurationError( None, "No Opts File has been specified, please provide one!") def getEnvScript(self): """ Return the script which wraps the running command in a correct environment """ return 'export CMTCONFIG=%s; source LbLogin.sh --cmtconfig=%s && ' % ( self.platform, self.platform) def execCmd(self, cmd): """ This method executes a command within the namespace of the project. The cmd is placed in a bash script which is executed within the env This will adopt the platform associated with this application. Any explicit calls to be run within the project env have to be prepended with './run '. This is not added automatically e.g. The following will execute a 'make' command within the given project dir app = GaudiExec('some/path') app.execCmd('make') Args: cmd (str): This is the command(s) which are to be executed within the project environment and directory """ cmd_file = tempfile.NamedTemporaryFile(suffix='.sh', delete=False) cmd_file.write("#!/bin/bash") cmd_file.write("\n") cmd_file.write(self.getEnvScript()) cmd_file.write(cmd) cmd_file.flush() cmd_file.close() st = os_stat(cmd_file.name) chmod(cmd_file.name, st.st_mode | stat.S_IEXEC) logger.debug("Running: %s" % cmd_file.name) # I would have preferred to execute all commands against inside `./run` so we have some sane behaviour # but this requires a build to have been run before we can use this command reliably... so we're just going to be explicit rc, stdout, stderr = _exec_cmd(cmd_file.name, self.directory) if rc != 0: logger.error("Failed to execute command: %s" % cmd_file.name) logger.error("Tried to execute command in: %s" % self.directory) logger.error("StdErr: %s" % str(stderr)) raise GangaException("Failed to Execute command") unlink(cmd_file.name) return rc, stdout, stderr @gaudiExecBuildLock def buildGangaTarget(self): """ This builds the ganga target 'ganga-input-sandbox' for the project defined by self.directory This returns the absolute path to the file after it has been created. It will fail if things go wrong or the file fails to generate """ logger.info( "Make-ing target '%s' (This may take a few minutes depending on the size of your project)" % GaudiExec.build_target) # Up to the user to run something like make clean... 
(Although that would avoid some potential CMake problems) self.execCmd('make %s' % GaudiExec.build_target) targetPath = path.join(self.directory, 'build.%s' % self.platform, 'ganga') if not path.isdir(targetPath): raise GangaException("Target Path: %s NOT found!" % targetPath) sandbox_str = '%s' % GaudiExec.build_dest targetFile = path.join(targetPath, sandbox_str) if not path.isfile(targetFile): raise GangaException("Target File: %s NOT found!" % targetFile) wantedTargetFile = path.join(targetPath, GaudiExec.cmake_sandbox_name) rename(targetFile, wantedTargetFile) if not path.isfile(wantedTargetFile): raise GangaException("Wanted Target File: %s NOT found" % wantedTargetFile) logger.info("Built %s" % wantedTargetFile) return wantedTargetFile def readInputData(self, opts): """ This reads the inputdata from a file and assigns it to the inputdata field of the parent job. Or you can use BKQuery and the box repo to save having to do this over and over Args: opts (str): This is the file which contains the inputdata we want to read in """ input_dataset = getGaudiExecInputData(opts, self) try: job = self.getJobObject() except: raise GangaException( "This makes no sense without first belonging to a job object as I can't assign input data!" ) if job.inputdata is not None and len(job.inputdata) > 0: logger.warning( "Warning Job %s already contained inputdata, overwriting" % job.fqid) job.inputdata = input_dataset def getWNPythonContents(self): """ Return the wrapper script which is used to run GaudiPython type jobs on the WN """ # FIXME should there be a more central definition of 'data.py' string to rename this for all of LHCb if it ever changes for LHCbDirac from ..RTHandlers.GaudiExecRTHandlers import GaudiExecDiracRTHandler all_names = [this_o.namePattern for this_o in self.options] return gaudiPythonWrapper(repr(self.extraArgs), self.getExtraOptsFileName(), GaudiExecDiracRTHandler.data_file, all_names)
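# --------------------------------------------------------------------------
# Usage sketch (assumption: an interactive Ganga session with the LHCb GPI
# loaded; the project path, option file and extraOpts content are illustrative).
def _example_gaudiexec_usage():
    j = Job(name='DV-tuple')
    myApp = GaudiExec()
    myApp.directory = '/some/path/DaVinciDev_v40r2'   # lb-dev project; 'make ganga-input-sandbox' runs here
    myApp.options = ['/some/path/DaVinciDev_v40r2/myDaVinciOpts.py']
    myApp.platform = 'x86_64-slc6-gcc49-opt'
    myApp.extraOpts = 'from Configurables import DaVinci\nDaVinci().EvtMax = 1000'
    myApp.extraArgs = ['--option', 'value']           # forwarded to the command run on the WN
    j.application = myApp
    j.backend = Dirac()
    j.submit()
# --------------------------------------------------------------------------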
class OstapRun(GaudiExec):
    """The main application to run ``Ostap''

    User needs to supply:
    - the scripts to be executed
    - ostap interactive commands to be executed (optional)
    - other arguments for the ``ostap'' script (optional)

    The actual command to be executed is:

    > ostap [ scripts [ scripts [ scripts ... [ arguments ] --no-color --batch [ --command [ commands [ commands [ commands

    ======
    Usage:
    ======

    j.application = OstapRun(
        scripts   = ['path_to_script/the_script.py'],
        arguments = ['--no-canvas'],
        commands  = ['print dir()'])
    """
    _schema = GaudiExec._schema.inherit_copy()
    _schema.version.major += 0
    _schema.version.minor += 0

    ## make entries
    for key, val in _schema.datadict.iteritems():
        if key == 'useGaudiRun':
            val._update({'defvalue': False})
        if not key in ('platform', 'directory'):
            if not val['hidden']:
                val._update({'hidden': 1})

    ## add new entries
    _schema.datadict['scripts'] = GangaFileItem(
        optional=0, sequence=1, strict_sequence=0,
        doc="""The names of ostap script files to be executed. The files are executed within the ``ostap'' context. A copy will be made at submission time""")
    _schema.datadict['commands'] = SimpleItem(
        defvalue=[], typelist=['str'], sequence=1, strict_sequence=0,
        doc="""The ostap commands to be executed, e.g. [ 'print dir()' ]""")
    _schema.datadict['arguments'] = SimpleItem(
        defvalue=[], typelist=['str'], sequence=1, strict_sequence=0,
        doc="The list of command-line arguments for the ``ostap'' script, e.g. ['-w','-p5'], etc. The following arguments are appended automatically: --no-color and --batch")
    ##
    _category = 'applications'
    _name = 'OstapRun'
    _exportmethods = ['prepare', 'unprepare']

    # =========================================================================
    def configure(self, masterjobconfig):
        self.options = [f for f in self.scripts]
        return (None, None)

    # =========================================================================
    def getWNPythonContents(self):
        """Return the wrapper script which is used to run Ostap on the WN
        """
        data_file = GaudiExecDiracRTHandler.data_file
        return _script_ostap_.format(
            scripts=[os.path.basename(os.path.join(f.localDir, f.namePattern)) for f in self.scripts],
            arguments=self.arguments,
            command=self.commands)
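# --------------------------------------------------------------------------
# Usage sketch (assumption: a Ganga session with the LHCb GPI; the project
# directory and script path are illustrative).  OstapRun only adds `scripts`,
# `commands` and `arguments` on top of the GaudiExec `platform`/`directory`.
def _example_ostaprun_usage():
    j = Job(name='ostap-fit')
    j.application = OstapRun(
        directory='/some/path/AnalysisDev_v21r1',
        scripts=['/some/path/fit_mass.py'],
        arguments=['--no-canvas'],
        commands=['print dir()'])
    j.backend = Dirac()
    j.submit()
# --------------------------------------------------------------------------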
class BenderRun(GaudiExec):
    """The main application to run ``BenderScript''

    User needs to supply:
    - the scripts to be executed
    - the configuration scripts (aka ``options'') to be imported (optional)
    - bender interactive commands to be executed (optional)
    - other arguments for the ``bender'' script (optional)

    The actual command to be executed is:

    > bender [ scripts [ scripts ... --no-color [ arguments ] --import [ imports [ imports [ ... --no-castor --import=data.py --batch [ --command [ commands [ ...

    ======
    Usage:
    ======

    j.application = BenderRun(
        scripts   = ['the_path/the_module.py'],
        imports   = ['some_import_file.py'],
        commands  = ['ls()', 'run(10)', 'ls()'],
        arguments = [...],
        directory = ...)
    """
    _schema = GaudiExec._schema.inherit_copy()
    _schema.version.major += 0
    _schema.version.minor += 0

    ## make entries
    for key, val in _schema.datadict.iteritems():
        if key == 'useGaudiRun':
            val._update({'defvalue': False})
        if not key in ('platform', 'directory'):
            if not val['hidden']:
                val._update({'hidden': 1})

    ## add new entries
    _schema.datadict['scripts'] = GangaFileItem(
        optional=0, sequence=1, strict_sequence=0,
        doc="""The names of the script files to execute. A copy will be made at submission time. The scripts are executed within the ``bender'' context""")
    _schema.datadict['imports'] = GangaFileItem(
        defvalue=[], sequence=1, strict_sequence=0,
        doc="""The names of the configuration scripts (aka ``options'') to be imported via ``importOptions''. A copy will be made at submission time""")
    _schema.datadict['commands'] = SimpleItem(
        defvalue=[], typelist=['str'], sequence=1, strict_sequence=0,
        doc="""The bender commands to be executed, e.g. [ 'run(10)', 'print ls()', 'print dir()' ]""")
    _schema.datadict['arguments'] = SimpleItem(
        defvalue=[], typelist=['str'], sequence=1, strict_sequence=0,
        doc="""The list of command-line arguments for the bender script, e.g. ['-w','-p5'], etc. The following arguments will be appended automatically: --no-color, --no-castor and --batch.""")
    ##
    _category = 'applications'
    _name = 'BenderRun'
    _exportmethods = ['prepare', 'unprepare']

    # =========================================================================
    def configure(self, masterjobconfig):
        self.options = [f for f in self.scripts] + [f for f in self.imports]
        return (None, None)

    # =========================================================================
    def getWNPythonContents(self):
        """Return the wrapper script which is used to run BenderScript on the WN
        """
        data_file = GaudiExecDiracRTHandler.data_file
        return _script_bender_.format(
            scripts=[os.path.basename(os.path.join(f.localDir, f.namePattern)) for f in self.scripts],
            arguments=self.arguments,
            imports=[os.path.basename(os.path.join(f.localDir, f.namePattern)) for f in self.imports],
            datafile=data_file,
            command=self.commands)
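# --------------------------------------------------------------------------
# Usage sketch (assumption: a Ganga session with the LHCb GPI; the paths and
# the lb-dev project are illustrative).  Mirrors the docstring above: `scripts`
# run inside the ``bender'' context and `imports` go through importOptions().
def _example_benderrun_usage():
    j = Job(name='bender-interactive')
    j.application = BenderRun(
        directory='/some/path/BenderDev_v30r1',
        scripts=['the_path/the_module.py'],
        imports=['some_import_file.py'],
        commands=['ls()', 'run(10)', 'ls()'])
    j.backend = Dirac()
    j.submit()
# --------------------------------------------------------------------------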
class BenderModule(GaudiExec):
    """The main application to run ``classic'' Bender (a module with the proper ``configure'' and ``run'' methods).

    The user needs to supply:
    - the Bender module to run
    - a dictionary of parameters to be forwarded to the <code>configure</code> method
    - the number of events to process

    ======
    Usage:
    ======
    j.application = BenderModule(
        module='the_path/the_module.py',
        events=1000,
        params={...},
        directory=...)
    """
    _schema = GaudiExec._schema.inherit_copy()
    _schema.version.major += 0
    _schema.version.minor += 0

    ## adjust the inherited entries
    for key, val in _schema.datadict.iteritems():
        if key == 'useGaudiRun':
            val._update({'defvalue': False})
        if key not in ('platform', 'directory'):
            if not val['hidden']:
                val._update({'hidden': 1})

    ## add new entries
    _schema.datadict['module'] = GangaFileItem(
        optional=0,
        doc="""The file with the Bender module. It is expected that the module contains the methods ``configure'' & ``run'' with the proper signatures""")
    _schema.datadict['params'] = SimpleItem(
        defvalue={}, typelist=['dict', 'str', 'int', 'bool', 'float'],
        doc="""The dictionary of parameters to be forwarded to the ``configure'' method of the supplied Bender module""")
    _schema.datadict['events'] = SimpleItem(defvalue=-1, typelist=['int'], doc="Number of events to process")

    _category = 'applications'
    _name = 'BenderModule'
    _exportmethods = ['prepare', 'unprepare']

    # =========================================================================
    def configure(self, masterjobconfig):
        self.options = [self.module]
        return (None, None)

    # =========================================================================
    def getWNPythonContents(self):
        """Return the wrapper script which is used to run Bender on the WN"""
        f = self.module
        file_name = os.path.basename(os.path.join(f.localDir, f.namePattern))
        module_name = file_name.split('.')[0]
        param_string = ',params=%s' % self.params if self.params else ''
        data_file = GaudiExecDiracRTHandler.data_file
        return _script_.format(
            datafile=data_file,
            modulename=module_name,
            paramstring=param_string,
            events=self.events)

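# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class): a minimal BenderModule job.
# The module path and parameter names are hypothetical placeholders; the
# supplied module is expected to define the ``configure'' and ``run'' methods
# described in the docstring above, and the params dictionary is forwarded to
# ``configure'' on the worker node.
#
#   j = Job(name='bender-module-example')
#   j.application = BenderModule(
#       module='analysis/my_bender_module.py',
#       params={'year': 2012, 'tight_cuts': True},
#       events=1000)
#   j.submit()
# ---------------------------------------------------------------------------
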
class Im3ShapeApp(IPrepareApp):
    """
    This application stores the configuration of the Im3Shape app, which is run according to a given set of configs, i.e.:
        ./run_dir/exe_name <someData> ini_location catalog <someOutput> rank size
    e.g.:
        ./run_dir/run-im3shape someData.fz ini_file.ini all someData.fz.0.20 0 20

    The input and output file names are configured in the RTHandler based upon the inputdata given to a particular job.

    The im3_location is the path to a .tgz file (or some other file which can be extracted by the RTHandler) which provides
    the im3shape-grid (run_dir) folder containing the run-im3shape app (exe_name) on the WN.
    """
    _schema = Schema(Version(1, 0), {
        ## Required to configure Im3ShapeApp
        'im3_location': GangaFileItem(defvalue=None, doc="Location of the Im3Shape program tarball"),
        'exe_name': SimpleItem(defvalue='run-im3shape', doc="Name of the im3shape binary"),
        'ini_location': GangaFileItem(defvalue=None, doc=".ini file used to configure Im3Shape"),
        'blacklist': GangaFileItem(defvalue=None, doc="Blacklist file for running Im3Shape"),
        'rank': SimpleItem(defvalue=0, doc="Rank in the split of the tile from splitting"),
        'size': SimpleItem(defvalue=200, doc="Size of the splitting of the tile from splitting"),
        'catalog': SimpleItem(defvalue='all', typelist=[str], doc="Catalog which is used to describe what is processed"),
        'run_dir': SimpleItem(defvalue='im3shape-grid', typelist=[str], doc="Directory on the WN where the binary is"),
        ## Below is needed for the prepared state
        'is_prepared': SimpleItem(defvalue=None, strict_sequence=0, visitable=1, copyable=1, hidden=0,
                                  typelist=[None, ShareDir], protected=0, comparable=1,
                                  doc='Location of shared resources. Presence of this attribute implies the application has been prepared.'),
        'hash': SimpleItem(defvalue=None, typelist=[None, str], hidden=0,
                           doc='MD5 hash of the string representation of applications preparable attributes'),
    })
    _category = 'applications'
    _name = 'Im3ShapeApp'
    _exportmethods = ['prepare', 'unprepare']

    def unprepare(self, force=False):
        """
        Revert an Im3ShapeApp application back to its unprepared state.
        Args:
            force (bool): should force the unprepare step to run
        """
        logger.debug('Running unprepare in Im3ShapeApp')
        if self.is_prepared is not None:
            self.decrementShareCounter(self.is_prepared.name)
            self.is_prepared = None
        self.hash = None

    def prepare(self, force=False):
        """
        This prepares the Im3ShapeApp application and copies any LocalFile objects which are allocated to
        im3_location, ini_location and blacklist into the prepared sandbox to be shipped to the WN.
        Args:
            force (bool): Should force the prepare step to run
        """
        if (self.is_prepared is not None) and (force is not True):
            raise ApplicationPrepareError('%s application has already been prepared. Use prepare(force=True) to prepare again.' % getName(self))

        logger.info('Preparing %s application.' % getName(self))
        self.is_prepared = ShareDir()
        logger.info('Created shared directory: %s' % (self.is_prepared.name))

        try:
            # copy any 'preparable' objects into the shared directory
            send_to_sharedir = self.copyPreparables()
            # add the newly created shared directory into the metadata system
            # if the app is associated with a persisted object
            self.checkPreparedHasParent(self)
            for file_ in [self.ini_location, self.im3_location, self.blacklist]:
                if isinstance(file_, LocalFile):
                    self.copyIntoPrepDir(file_.namePattern)
                assert type(file_) in [LocalFile, DiracFile]
            # return [os.path.join(self.is_prepared.name, os.path.basename(send_to_sharedir))]
            self.post_prepare()
        except Exception as err:
            logger.debug("Err: %s" % str(err))
            self.unprepare()
            raise

        return 1

    def configure(self, masterappconfig):
        """
        This is effectively a null-op; we may add something here in the future, but for now this function is a stub.
        It is required so that the job will submit.
        """
        return (None, None)

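# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class): configuring an Im3ShapeApp job.
# The tarball, .ini file and blacklist paths are hypothetical placeholders;
# the actual input/output file names are filled in by the RTHandler from the
# job's inputdata, as described in the class docstring.
#
#   app = Im3ShapeApp(
#       im3_location=DiracFile(lfn='/some/grid/path/im3shape-grid.tgz'),
#       ini_location=LocalFile('params_disc.ini'),
#       blacklist=LocalFile('blacklist-y1.txt'),
#       rank=0, size=200)
#   j = Job(application=app)
#   j.submit()    # prepare() runs at submission, copying the LocalFiles into the shared directory
# ---------------------------------------------------------------------------
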
class LHCbDataset(GangaDataset):
    '''Class for handling LHCb data sets (i.e. inputdata for LHCb jobs).

    Example Usage:
    ds = LHCbDataset(["lfn:/some/lfn.file", "pfn:/some/pfn.file"])
    ds[0]                      # DiracFile("/some/lfn.file") - see DiracFile docs for usage
    ds[1]                      # PhysicalFile("/some/pfn.file") - see PhysicalFile docs for usage
    len(ds)                    # 2 (number of files)
    ds.getReplicas()           # returns replicas for *all* files in the data set
    ds.replicate("CERN-USER")  # replicate *all* LFNs to the "CERN-USER" SE
    ds.getCatalog()            # returns XML catalog slice
    ds.optionsString()         # returns Gaudi-style options
    [...etc...]
    '''

    schema = {}
    docstr = 'List of PhysicalFile and DiracFile objects'
    schema['files'] = GangaFileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile'], sequence=1, doc=docstr)
    docstr = 'Ancestor depth to be queried from the Bookkeeping'
    schema['depth'] = SimpleItem(defvalue=0, doc=docstr)
    docstr = 'Use contents of file rather than generating catalog.'
    schema['XMLCatalogueSlice'] = GangaFileItem(defvalue=None, doc=docstr)
    docstr = 'Specify the dataset persistency technology'
    schema['persistency'] = SimpleItem(defvalue=None, typelist=['str', 'type(None)'], doc=docstr)
    schema['treat_as_inputfiles'] = SimpleItem(defvalue=False, doc="Treat the inputdata as inputfiles, i.e. copy the inputdata to the WN")

    _schema = Schema(Version(3, 0), schema)
    _category = 'datasets'
    _name = "LHCbDataset"
    _exportmethods = ['getReplicas', '__len__', '__getitem__', 'replicate', 'hasLFNs',
                      'append', 'extend', 'getCatalog', 'optionsString', 'getLFNs',
                      'getFileNames', 'getFullFileNames', 'difference', 'isSubset',
                      'isSuperset', 'intersection', 'symmetricDifference', 'union',
                      'bkMetadata', 'isEmpty', 'hasPFNs', 'getPFNs']  # ,'pop']

    def __init__(self, files=None, persistency=None, depth=0):
        super(LHCbDataset, self).__init__()
        if files is None:
            files = []
        new_files = GangaList()
        if isType(files, LHCbDataset):
            for this_file in files:
                new_files.append(deepcopy(this_file))
        elif isType(files, IGangaFile):
            new_files.append(deepcopy(files))
        elif isType(files, (list, tuple, GangaList)):
            new_list = []
            for this_file in files:
                if type(this_file) is str:
                    new_file = string_datafile_shortcut_lhcb(this_file, None)
                elif isType(this_file, IGangaFile):
                    new_file = this_file
                else:
                    new_file = strToDataFile(this_file)
                new_list.append(stripProxy(new_file))
            stripProxy(new_files)._list = new_list
        elif type(files) is str:
            new_files.append(string_datafile_shortcut_lhcb(files, None), False)
        else:
            raise GangaException("Unknown object passed to LHCbDataset constructor!")
        new_files._setParent(self)
        logger.debug("Processed inputs, assigning files")
        # Feel free to turn this on again for debugging but it's potentially quite expensive
        #logger.debug("Creating dataset with:\n%s" % files)
        self.files = new_files
        logger.debug("Assigned files")
        self.persistency = persistency
        self.depth = depth
        logger.debug("Dataset Created")

    #def __deepcopy__(self, memo):
    #    stripProxy(self)._getReadAccess()
    #    cls = type(stripProxy(self))
    #    obj = super(cls, cls).__new__(cls)
    #    this_dict = stripProxy(self).__getstate__()
    #    for n in this_dict.keys():
    #        this_dict[n] = deepcopy(this_dict[n], memo)
    #        #if n == 'files':
    #        #    for this_file in this_dict['files']:
    #        #        stripProxy(this_file)._setParent(obj)
    #    obj.__setstate__(this_dict)
    #    return obj

    def __construct__(self, args):
        logger.debug("__construct__")
        self.files = []
        if (len(args) != 1):
            super(LHCbDataset, self).__construct__(args[1:])
        #logger.debug("__construct__: %s" % str(args))
        if len(args) == 0:
            return
        self.files = []
        if type(args[0]) is str:
            this_file = string_datafile_shortcut_lhcb(args[0], None)
            self.files.append(this_file)
        else:
            for file_arg in args[0]:
                if type(file_arg) is str:
                    this_file = string_datafile_shortcut_lhcb(file_arg, None)
                else:
                    this_file = file_arg
                self.files.append(this_file)
        # Equally as expensive
        #logger.debug("Constructing dataset len: %s\n%s" % (str(len(self.files)), str(self.files)))
        logger.debug("Constructing dataset len: %s" % str(len(self.files)))

    def __len__(self):
        """The number of files in the dataset."""
        result = 0
        if self.files:
            result = len(self.files)
        return result

    def __nonzero__(self):
        """This is always True, as with an object."""
        return True

    def __getitem__(self, i):
        '''Provides indexing and slicing (e.g. ds[2] returns the 3rd file).'''
        #this_file = self.files[i]
        #return GPIProxyObjectFactory(this_file)
        if isinstance(i, slice):
            ds = LHCbDataset(files=self.files[i])
            ds.depth = self.depth
            #ds.XMLCatalogueSlice = self.XMLCatalogueSlice
            return ds
        else:
            return self.files[i]

    def isEmpty(self):
        return not bool(self.files)

    def getReplicas(self):
        'Returns the replicas for all files in the dataset.'
        lfns = self.getLFNs()
        cmd = 'getReplicas(%s)' % str(lfns)
        result = get_result(cmd, 'LFC query error', 'Could not get replicas.')
        return result['Value']['Successful']

    def hasLFNs(self):
        'Returns True if the dataset has LFNs and False otherwise.'
        for f in self.files:
            if isDiracFile(f):
                return True
        return False

    def hasPFNs(self):
        'Returns True if the dataset has PFNs and False otherwise.'
        for f in self.files:
            if not isDiracFile(f):
                return True
        return False

    def replicate(self, destSE=''):
        '''Replicate all LFNs to destSE. For a list of valid SE\'s, type ds.replicate().'''
        if not destSE:
            from GangaDirac.Lib.Files.DiracFile import DiracFile
            DiracFile().replicate('')
            return
        if not self.hasLFNs():
            raise GangaException('Cannot replicate dataset w/ no LFNs.')

        retry_files = []
        for f in self.files:
            if not isDiracFile(f):
                continue
            try:
                result = f.replicate(destSE=destSE)
            except Exception as err:
                msg = 'Replication error for file %s (will retry in a bit).' % f.lfn
                logger.warning(msg)
                logger.warning("Error: %s" % str(err))
                retry_files.append(f)

        for f in retry_files:
            try:
                result = f.replicate(destSE=destSE)
            except Exception as err:
                msg = '2nd replication attempt failed for file %s. (will not retry)' % f.lfn
                logger.warning(msg)
                logger.warning(str(err))

    def append(self, input_file):
        self.extend([input_file])

    def extend(self, files, unique=False):
        '''Extend the dataset. If unique, then only add files which are not already in the dataset.'''
        from Ganga.GPIDev.Base import ReadOnlyObjectError

        if self._parent is not None and self._parent._readonly():
            raise ReadOnlyObjectError('object Job#%s is read-only and attribute "%s/inputdata" cannot be modified now' % (self._parent.id, getName(self)))

        _external_files = []

        if type(files) is str or isType(files, IGangaFile):
            _external_files = [files]
        elif type(files) in [list, tuple]:
            _external_files = files
        elif isType(files, LHCbDataset):
            _external_files = files.files
        else:
            if not hasattr(files, "__getitem__") or not hasattr(files, '__iter__'):
                _external_files = [files]

        # just in case they extend w/ self
        _to_remove = []
        for this_file in _external_files:
            if hasattr(this_file, 'subfiles'):
                if len(this_file.subfiles) > 0:
                    _external_files = makeGangaListByRef(this_file.subfiles)
                    _to_remove.append(this_file)
            if type(this_file) is str:
                _external_files.append(string_datafile_shortcut_lhcb(this_file, None))
                _to_remove.append(this_file)

        for _this_file in _to_remove:
            _external_files.pop(_external_files.index(_this_file))

        for this_f in _external_files:
            _file = getDataFile(this_f)
            if _file is None:
                _file = this_f
            myName = _file.namePattern
            from GangaDirac.Lib.Files.DiracFile import DiracFile
            if isType(_file, DiracFile):
                myName = _file.lfn
            if unique and myName in self.getFileNames():
                continue
            self.files.append(stripProxy(_file))

    def removeFile(self, input_file):
        try:
            self.files.remove(input_file)
        except:
            raise GangaException('Dataset has no file named %s' % input_file.namePattern)

    def getLFNs(self):
        'Returns a list of all LFNs (by name) stored in the dataset.'
        lfns = []
        if not self:
            return lfns
        for f in self.files:
            if isDiracFile(f):
                subfiles = f.getSubFiles()
                if len(subfiles) == 0:
                    lfns.append(f.lfn)
                else:
                    for file in subfiles:
                        lfns.append(file.lfn)
        #logger.debug("Returning LFNS:\n%s" % str(lfns))
        logger.debug("Returning #%s LFNS" % str(len(lfns)))
        return lfns

    def getPFNs(self):
        'Returns a list of all PFNs (by name) stored in the dataset.'
        pfns = []
        if not self:
            return pfns
        for f in self.files:
            if isPFN(f):
                pfns.append(f.namePattern)
        return pfns

    def getFileNames(self):
        'Returns a list of the names of all files stored in the dataset.'
        names = []
        from GangaDirac.Lib.Files.DiracFile import DiracFile
        for i in self.files:
            if isType(i, DiracFile):
                names.append(i.lfn)
            else:
                try:
                    names.append(i.namePattern)
                except:
                    logger.warning("Cannot determine filename for: %s " % i)
                    raise GangaException("Cannot Get File Name")
        return names

    def getFullFileNames(self):
        'Returns all file names w/ PFN or LFN prepended.'
        names = []
        from GangaDirac.Lib.Files.DiracFile import DiracFile
        for f in self.files:
            if isType(f, DiracFile):
                names.append('LFN:%s' % f.lfn)
            else:
                try:
                    names.append('PFN:%s' % f.namePattern)
                except:
                    logger.warning("Cannot determine filename for: %s " % f)
                    raise GangaException("Cannot Get File Name")
        return names

    def getCatalog(self, site=''):
        '''Generates an XML catalog from the dataset (returns the XML string).
        Note: site defaults to config.LHCb.LocalSite
        Note: If the XMLCatalogueSlice attribute is set, then it returns what is written there.'''
        if hasattr(self.XMLCatalogueSlice, 'name'):
            if self.XMLCatalogueSlice.name:
                f = open(self.XMLCatalogueSlice.name)
                xml_catalog = f.read()
                f.close()
                return xml_catalog
        if not site:
            site = getConfig('LHCb')['LocalSite']
        lfns = self.getLFNs()
        depth = self.depth
        tmp_xml = tempfile.NamedTemporaryFile(suffix='.xml')
        cmd = 'getLHCbInputDataCatalog(%s,%d,"%s","%s")' % (str(lfns), depth, site, tmp_xml.name)
        result = get_result(cmd, 'LFN->PFN error', 'XML catalog error.')
        xml_catalog = tmp_xml.read()
        tmp_xml.close()
        return xml_catalog

    def optionsString(self, file=None):
        'Returns the Gaudi-style options string for the dataset (if a filename is given, the file is created and output is written there).'
        if not self or len(self) == 0:
            return ''

        snew = ''
        if self.persistency == 'ROOT':
            snew = '\n#new method\nfrom GaudiConf import IOExtension\nIOExtension(\"%s\").inputFiles([' % self.persistency
        elif self.persistency == 'POOL':
            snew = '\ntry:\n    #new method\n    from GaudiConf import IOExtension\n    IOExtension(\"%s\").inputFiles([' % self.persistency
        elif self.persistency is None:
            snew = '\ntry:\n    #new method\n    from GaudiConf import IOExtension\n    IOExtension().inputFiles(['
        else:
            logger.warning("Unknown LHCbDataset persistency technology... reverting to None")
            snew = '\ntry:\n    #new method\n    from GaudiConf import IOExtension\n    IOExtension().inputFiles(['

        sold = '\nexcept ImportError:\n    #Use previous method\n    from Gaudi.Configuration import EventSelector\n    EventSelector().Input=['

        sdatasetsnew = ''
        sdatasetsold = ''

        dtype_str_default = getConfig('LHCb')['datatype_string_default']
        dtype_str_patterns = getConfig('LHCb')['datatype_string_patterns']
        for f in self.files:
            dtype_str = dtype_str_default
            for this_str in dtype_str_patterns:
                matched = False
                for pat in dtype_str_patterns[this_str]:
                    if fnmatch.fnmatch(f.namePattern, pat):
                        dtype_str = this_str
                        matched = True
                        break
                if matched:
                    break
            sdatasetsnew += '\n '
            sdatasetsold += '\n '
            if isDiracFile(f):
                sdatasetsnew += """ \"LFN:%s\",""" % f.lfn
                sdatasetsold += """ \"DATAFILE='LFN:%s' %s\",""" % (f.lfn, dtype_str)
            else:
                sdatasetsnew += """ \"PFN:%s\",""" % f.namePattern
                sdatasetsold += """ \"DATAFILE='PFN:%s' %s\",""" % (f.namePattern, dtype_str)

        if sdatasetsold.endswith(","):
            if self.persistency == 'ROOT':
                sdatasetsnew = sdatasetsnew[:-1] + """\n], clear=True)"""
            else:
                sdatasetsnew = sdatasetsnew[:-1] + """\n ], clear=True)"""
            sdatasetsold = sdatasetsold[:-1]
            sdatasetsold += """\n ]"""

        if file:
            f = open(file, 'w')
            if self.persistency == 'ROOT':
                f.write(snew)
                f.write(sdatasetsnew)
            else:
                f.write(snew)
                f.write(sdatasetsnew)
                f.write(sold)
                f.write(sdatasetsold)
            f.close()
        else:
            if self.persistency == 'ROOT':
                return snew + sdatasetsnew
            else:
                return snew + sdatasetsnew + sold + sdatasetsold

    def _checkOtherFiles(self, other):
        if isType(other, GangaList) or isType(other, list):
            other_files = LHCbDataset(other).getFullFileNames()
        elif isType(other, LHCbDataset):
            other_files = other.getFullFileNames()
        else:
            raise GangaException("Unknown type for difference")
        return other_files

    def difference(self, other):
        '''Returns a new data set w/ files in this that are not in other.'''
        other_files = self._checkOtherFiles(other)
        files = set(self.getFullFileNames()).difference(other_files)
        data = LHCbDataset()
        data.__construct__([list(files)])
        data.depth = self.depth
        return GPIProxyObjectFactory(data)

    def isSubset(self, other):
        '''Is every file in this data set in other?'''
        other_files = self._checkOtherFiles(other)
        return set(self.getFileNames()).issubset(other_files)

    def isSuperset(self, other):
        '''Is every file in other in this data set?'''
        other_files = self._checkOtherFiles(other)
        return set(self.getFileNames()).issuperset(other_files)

    def symmetricDifference(self, other):
        '''Returns a new data set w/ files in either this or other but not both.'''
        other_files = self._checkOtherFiles(other)
        files = set(self.getFullFileNames()).symmetric_difference(other_files)
        data = LHCbDataset()
        data.__construct__([list(files)])
        data.depth = self.depth
        return GPIProxyObjectFactory(data)

    def intersection(self, other):
        '''Returns a new data set w/ files common to this and other.'''
        other_files = self._checkOtherFiles(other)
        files = set(self.getFullFileNames()).intersection(other_files)
        data = LHCbDataset()
        data.__construct__([list(files)])
        data.depth = self.depth
        return GPIProxyObjectFactory(data)

    def union(self, other):
        '''Returns a new data set w/ files from this and other.'''
        other_files = self._checkOtherFiles(other)
        files = set(self.getFullFileNames()).union(other_files)
        data = LHCbDataset()
        data.__construct__([list(files)])
        data.depth = self.depth
        return GPIProxyObjectFactory(data)

    def bkMetadata(self):
        'Returns the bookkeeping metadata for all LFNs.'
        logger.info("Using BKQuery(bkpath).getDatasetMetadata() with bkpath=the bookkeeping path will yield more metadata such as 'TCK' info...")
        cmd = 'bkMetaData(%s)' % self.getLFNs()
        b = get_result(cmd, 'Error getting bookkeeping metadata', 'Bookkeeping metadata error.')
        return b

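# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class): combining LHCbDataset objects
# with the set-style helpers defined above.  The LFNs below are hypothetical.
#
#   ds1 = LHCbDataset(['lfn:/lhcb/MC/2012/a.dst', 'lfn:/lhcb/MC/2012/b.dst'])
#   ds2 = LHCbDataset(['lfn:/lhcb/MC/2012/b.dst', 'lfn:/lhcb/MC/2012/c.dst'])
#
#   new_only = ds1.difference(ds2)      # files in ds1 but not in ds2
#   overlap  = ds1.intersection(ds2)    # files common to both
#   combined = ds1.union(ds2)           # all files, duplicates removed
#   ds1.isSubset(combined)              # True
#
#   j = Job()
#   j.inputdata = combined              # use the result as job inputdata
# ---------------------------------------------------------------------------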