    def setUp(self):
        self.dd = {
            'application': ComponentItem(category='applications'),
            'backend': ComponentItem(category='backends'),
            'name': SimpleItem('', comparable=0),
            'workdir': SimpleItem(defvalue=None, type='string', transient=1, protected=1, comparable=0),
            'status': SimpleItem(defvalue='new', protected=1, comparable=0),
            'id': SimpleItem(defvalue=None, typelist=[str], protected=1, comparable=0),
            'inputbox': FileItem(defvalue=[], sequence=1),
            'outputbox': FileItem(defvalue=[], sequence=1),
            'overriden_copyable': SimpleItem(defvalue=None, protected=1, copyable=1),
            'plain_copyable': SimpleItem(defvalue=None, copyable=0)
        }
        self.s = Schema(Version(1, 0), self.dd)
class CustomMerger(IMerger):
    """User tool for writing custom merging tools with Python.

    Allows a script to be supplied that performs the merge of some custom file
    type. The script must be a python file which defines the following function:

    def mergefiles(file_list, output_file):
        # perform the merge
        if not success:
            return False
        return True

    This module will be imported and used by the CustomMerger. file_list is a
    list of paths to the files to be merged. output_file is a string path for
    the output of the merge. This file must exist by the end of the merge or
    the merge will fail. The function must return True on success; any other
    return value (or an exception) fails the merge.

    Clearly this tool is provided for advanced ganga usage only, and should be
    used with this in mind.
    """
    _category = 'postprocessor'
    _name = 'CustomMerger'
    _schema = IMerger._schema.inherit_copy()
    _schema.datadict['module'] = FileItem(
        defvalue=None, doc='Path to a python module to perform the merge.')

    def mergefiles(self, file_list, output_file):
        import os
        import copy
        if not os.path.exists(self.module.name):
            raise PostProcessException(
                "The module '%s' does not exist and so merging will fail." %
                self.module.name)
        result = False
        try:
            ns = {
                'file_list': copy.copy(file_list),
                'output_file': copy.copy(output_file)
            }
            execfile(self.module.name, ns)
            exec('_result = mergefiles(file_list, output_file)', ns)
            result = ns.get('_result', result)
        except Exception as e:
            raise PostProcessException(
                'There was a problem executing the custom merge: %s. Merge will fail.' % e)

        if result is not True:
            raise PostProcessException(
                'The custom merge did not return True, merge will fail.')

        return self.success
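# --- Hedged example (not part of the Ganga source): a user module that could be
# supplied via CustomMerger.module. The file name 'mymerge.py' and the simple
# concatenation logic are illustrative; the required entry point is
# mergefiles(file_list, output_file) returning True on success, as enforced by
# CustomMerger.mergefiles above.
#
# mymerge.py
def mergefiles(file_list, output_file):
    """Concatenate the input files into a single output file."""
    with open(output_file, 'w') as out:
        for path in file_list:
            with open(path) as src:
                out.write(src.read())
    return True
#
# Attaching it to a job (illustrative):
#   cm = CustomMerger(module='~/mymerge.py')
#   j.postprocessors.append(cm)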
class CustomChecker(IChecker):
    """User tool for writing custom checks with Python.

    Make a file, e.g. customcheck.py, and in that file do something like:

    def check(j):
        if j has passed:
            return True
        else:
            return False

    When the job is about to be completed, Ganga will call this function and
    fail the job if False is returned.
    """
    _category = 'postprocessor'
    _name = 'CustomChecker'
    _schema = IChecker._schema.inherit_copy()
    _schema.datadict['module'] = FileItem(
        defvalue=None, doc='Path to a python module to perform the check.')
    _exportmethods = ['check']

    def check(self, job):
        if (self.module is None) or not self.module:
            raise PostProcessException(
                "No module is specified and so the check will fail.")
        if (self.module.name is None) or not os.path.isfile(self.module.name):
            raise PostProcessException(
                "The module '%s' does not exist and so CustomChecker will do nothing!" %
                (self.module.name))

        result = None
        try:
            ns = {'job': job}
            execfile(self.module.name, ns)
            exec('_result = check(job)', ns)
            result = ns.get('_result', result)
        except Exception as e:
            raise PostProcessException(
                'There was a problem with executing the module: %s, CustomChecker will do nothing!' % e)

        if result is not True and result is not False:
            raise PostProcessException(
                'The custom check module did not return True or False, CustomChecker will do nothing!')

        if result is not True:
            logger.info('The custom check module returned False for job(%s)', job.fqid)
            return self.failure
        return self.success
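# --- Hedged example (not part of the Ganga source): a user module that could be
# supplied via CustomChecker.module. The file name 'customcheck.py' and the
# particular condition are illustrative; the required entry point is check(j)
# returning True or False, as enforced by CustomChecker.check above.
#
# customcheck.py
import os

def check(j):
    """Pass the job only if it produced a non-empty stdout in its output directory."""
    stdout_path = os.path.join(j.outputdir, 'stdout')
    return os.path.isfile(stdout_path) and os.path.getsize(stdout_path) > 0
#
# Attaching it to a job (illustrative):
#   cc = CustomChecker(module='~/customcheck.py')
#   j.postprocessors.append(cc)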
class DSTMerger(AbstractMerger):
    """A merger object for DST files.

    The merger uses DaVinci to combine DST files that have been returned
    *locally* in a job's outputsandbox. As such it is mainly useful for
    microDST files. The usage is as with other merger objects. See the help
    for TextMerger or RootMerger for more details.

    Example:

    dm = DSTMerger()
    dm.files = ['dv.dst']

    This object can be attached to a job object or used to merge a list of
    jobs with its merge method.

    It is possible to override the default opts file for performing the merge.
    A new opts file can be provided via the 'merge_opts' field. This should be
    done with care, as some opts are assumed when writing the files for output.
    """
    _category = 'mergers'
    _exportmethods = ['merge']
    _name = 'DSTMerger'
    _schema = AbstractMerger._schema.inherit_copy()
    docstr = 'Path to an options file to use when merging.'
    _schema.datadict['merge_opts'] = FileItem(defvalue=None, doc=docstr)
    docstr = 'The version of DaVinci to use when merging. (e.g. v19r14)'
    _schema.datadict['version'] = SimpleItem(defvalue='', doc=docstr)

    def __init__(self):
        super(DSTMerger, self).__init__(_DSTMergeTool())

    def merge(self, jobs, outputdir=None, ignorefailed=None, overwrite=None):
        self.merge_tool.merge_opts = self.merge_opts
        self.merge_tool.version = self.version
        logger.debug("begin to register files")
        # register output files for each completed subjob
        for sj in jobs:
            if sj.status == 'completed':
                sj.application.register()
class ITransform(GangaObject): _schema = Schema(Version(1, 0), { 'status': SimpleItem(defvalue='new', protected=1, copyable=1, doc='Status - running, pause or completed', typelist=[str]), 'name': SimpleItem(defvalue='Simple Transform', doc='Name of the transform (cosmetic)', typelist=[str]), 'application': ComponentItem('applications', defvalue=None, optional=1, load_default=False, doc='Application of the Transform.'), 'inputsandbox': FileItem(defvalue=[], sequence=1, doc="list of File objects shipped to the worker node "), 'outputsandbox': SimpleItem(defvalue=[], typelist=[str], sequence=1, doc="list of filenames or patterns shipped from the worker node"), 'backend': ComponentItem('backends', defvalue=None, optional=1, load_default=False, doc='Backend of the Transform.'), 'splitter': ComponentItem('splitters', defvalue=None, optional=1, load_default=False, doc='Splitter used on each unit of the Transform.'), 'postprocessors': ComponentItem('postprocessor', defvalue=None, doc='list of postprocessors to run after job has finished'), 'merger': ComponentItem('mergers', defvalue=None, hidden=1, copyable=0, load_default=0, optional=1, doc='Merger to be done over all units when complete.'), 'unit_merger': ComponentItem('mergers', defvalue=None, load_default=0, optional=1, doc='Merger to be copied and run on each unit separately.'), 'copy_output': ComponentItem('datasets', defvalue=None, load_default=0, optional=1, doc='The dataset to copy all units output to, e.g. Grid dataset -> Local Dataset'), 'unit_copy_output': ComponentItem('datasets', defvalue=None, load_default=0, optional=1, doc='The dataset to copy each individual unit output to, e.g. Grid dataset -> Local Dataset'), 'run_limit': SimpleItem(defvalue=8, doc='Number of times a partition is tried to be processed.', protected=1, typelist=[int]), 'minor_run_limit': SimpleItem(defvalue=3, doc='Number of times a unit can be resubmitted', protected=1, typelist=[int]), 'major_run_limit': SimpleItem(defvalue=3, doc='Number of times a junit can be rebrokered', protected=1, typelist=[int]), 'units': ComponentItem('units', defvalue=[], sequence=1, copyable=1, doc='list of units'), 'inputdata': ComponentItem('datasets', defvalue=[], sequence=1, protected=1, optional=1, load_default=False, doc='Input datasets to run over'), 'outputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Output dataset template'), 'inputfiles': GangaFileItem(defvalue=[], sequence=1, doc="list of file objects that will act as input files for a job"), 'outputfiles' : GangaFileItem(defvalue=[], sequence=1, doc="list of OutputFile objects to be copied to all jobs"), 'metadata': ComponentItem('metadata', defvalue=MetadataDict(), doc='the metadata', protected=1), 'rebroker_on_job_fail': SimpleItem(defvalue=True, doc='Rebroker if too many minor resubs'), 'abort_loop_on_submit': SimpleItem(defvalue=True, doc='Break out of the Task Loop after submissions'), 'required_trfs': SimpleItem(defvalue=[], typelist=[int], sequence=1, doc="IDs of transforms that must complete before this unit will start. NOTE DOESN'T COPY OUTPUT DATA TO INPUT DATA. Use TaskChainInput Dataset for that."), 'chain_delay': SimpleItem(defvalue=0, doc='Minutes delay between a required/chained unit completing and starting this one', protected=0, typelist=[int]), 'submit_with_threads': SimpleItem(defvalue=False, doc='Use Ganga Threads for submission'), 'max_active_threads': SimpleItem(defvalue=10, doc='Maximum number of Ganga Threads to use. 
Note that the number of simultaneous threads is controlled by the queue system (default is 5)'), 'info' : SimpleItem(defvalue=[],typelist=[str],protected=1,sequence=1,doc="Info showing status transitions and unit info"), 'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Transform', typelist=[int]), #'force_single_unit' : SimpleItem(defvalue=False, doc='Force all input data into one Unit'), }) _category = 'transforms' _name = 'ITransform' _exportmethods = ['addInputData', 'resetUnit', 'setRunLimit', 'getJobs', 'setMinorRunLimit', 'setMajorRunLimit', 'getID', 'overview', 'resetUnitsByStatus', 'removeUnusedJobs', 'showInfo', 'showUnitInfo', 'pause', 'n_all', 'n_status' ] _hidden = 0 def showInfo(self): """Print out the info in a nice way""" print("\n".join( self.info )) def showUnitInfo(self, uid): """Print out the given unit info in a nice way""" self.units[uid].showInfo() def getJobs(self): """Return a list of the currently active job ids""" joblist = [] for u in self.units: joblist += u.active_job_ids return joblist def setMinorRunLimit(self, newRL): """Set the number of times a job will be resubmitted before a major resubmit is attempted""" self.minor_run_limit = newRL def setMajorRunLimit(self, newRL): """Set the number of times a job will be rebrokered before the transform is paused""" self.major_run_limit = newRL def setRunLimit(self, newRL): """Set the total (minor+major) number of times a job should be resubmitted before the transform is paused""" self.run_limit = newRL def overview(self, status=''): """Show the status of the units in this transform""" for unit in self.units: # display colour given state o = "" o += ("%d: " % self.units.index(unit)) + unit.name # is unit active? if unit.active: o += " " * (40 - len(o) + 3) + "*" else: o += " " * (40 - len(o) + 3) + "-" # sub job status o += "\t %i" % unit.n_status("submitted") o += "\t %i" % unit.n_status("running") o += "\t %i" % unit.n_status("completed") o += "\t %i" % unit.n_status("failed") o += "\t %i" % unit.minor_resub_count o += "\t %i" % unit.major_resub_count # change colour on state if unit.status == 'completed': o = markup(o, overview_colours["completed"]) elif not unit.active: o = markup(o, overview_colours["bad"]) elif unit.status == "recreating": o = markup(o, overview_colours["attempted"]) elif len(unit.active_job_ids) == 0: o = markup(o, overview_colours["hold"]) else: o = markup(o, overview_colours["running"]) print(o) # Special methods: def __init__(self): super(ITransform, self).__init__() self.initialize() def _auto__init__(self): self.status = 'new' def _readonly(self): """A transform is read-only if the status is not new.""" if self.status == "new": return 0 return 1 def initialize(self): from Ganga.Lib.Localhost.Localhost import Localhost self.backend = Localhost() def check(self): """Check this transform has valid data, etc. 
and has the correct units""" # ignore anything but new transforms if self.status != "new": return # first, validate the transform if not self.validate(): raise ApplicationConfigurationError( None, "Validate failed for Transform %s" % self.name) self.updateStatus("running") def startup(self): """This function is used to set the status after restarting Ganga""" pass # Public methods def resetUnit(self, uid): """Reset the given unit""" addInfoString( self, "Reseting Unit %i" % ( uid ) ) for u in self.units: if u.getID() == uid: u.reset() break # find any chained units and mark for recreation for trf in self._getParent().transforms: for u2 in trf.units: for req in u2.req_units: if req == "%d:%d" % (self.getID(), u.getID()) or req == "%d:ALL" % (self.getID()): trf.resetUnit(u2.getID()) self.updateStatus("running") def getID(self): """Return the index of this trf in the parent task""" # if the id isn't already set, use the index from the parent Task if self.id < 0: task = self._getParent() if not task: raise ApplicationConfigurationError( None, "This transform has not been associated with a task and so there is no ID available") self.id = task.transforms.index(self) return self.id def run(self, check=True): """Sets this transform to running status""" if self.status == "new" and check: self.check() if self.status != "completed": self.updateStatus("running") task = self._getParent() if task: task.updateStatus() else: logger.warning("Transform is already completed!") def update(self): """Called by the parent task to check for status updates, submit jobs, etc.""" if self.status == "pause" or self.status == "new": return 0 # check for complete required units task = self._getParent() for trf_id in self.required_trfs: if task.transforms[trf_id].status != "completed": return 0 # set the start time if not already set if len(self.required_trfs) > 0 and self.units[0].start_time == 0: for unit in self.units: unit.start_time = time.time() + self.chain_delay * 60 - 1 # report the info for this transform unit_status = { "new":0, "hold":0, "running":0, "completed":0, "bad":0, "recreating":0 } for unit in self.units: unit_status[unit.status] += 1 info_str = "Unit overview: %i units, %i new, %i hold, %i running, %i completed, %i bad. 
to_sub %i" % (len(self.units), unit_status["new"], unit_status["hold"], unit_status["running"], unit_status["completed"], unit_status["bad"], self._getParent().n_tosub()) addInfoString(self, info_str) # ask the unit splitter if we should create any more units given the # current data self.createUnits() # loop over units and update them ((re)submits will be called here) old_status = self.status unit_status_list = [] # find submissions first unit_update_list = [] for unit in self.units: if not unit.checkForSubmission() and not unit.checkForResubmission(): unit_update_list.append(unit) continue if unit.update() and self.abort_loop_on_submit: logger.info("Unit %d of transform %d, Task %d has aborted the loop" % ( unit.getID(), self.getID(), task.id)) return 1 unit_status_list.append(unit.status) # now check for download for unit in unit_update_list: if unit.update() and self.abort_loop_on_submit: logger.info("Unit %d of transform %d, Task %d has aborted the loop" % ( unit.getID(), self.getID(), task.id)) return 1 unit_status_list.append(unit.status) from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput # check for any TaskChainInput completions for ds in self.inputdata: if isType(ds, TaskChainInput) and ds.input_trf_id != -1: if task.transforms[ds.input_trf_id].status != "completed": return 0 # update status and check for state in ['running', 'hold', 'bad', 'completed']: if state in unit_status_list: if state == 'hold': state = "running" if state != self.status: self.updateStatus(state) break def createUnits(self): """Create new units if required given the inputdata""" from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput # check for chaining for ds in self.inputdata: if isType(ds, TaskChainInput) and ds.input_trf_id != -1: # check for single unit if ds.single_unit: # is there a unit already linked? done = False rec_unit = None for out_unit in self.units: if '%d:ALL' % (ds.input_trf_id) in out_unit.req_units: done = True # check if the unit is being recreated if out_unit.status == "recreating": rec_unit = out_unit break if not done or rec_unit: new_unit = self.createChainUnit( self._getParent().transforms[ds.input_trf_id].units, ds.use_copy_output) if new_unit: self.addChainUnitToTRF( new_unit, ds, -1, prev_unit=rec_unit) else: # loop over units in parent trf and create units as # required for in_unit in self._getParent().transforms[ds.input_trf_id].units: # is there a unit already linked? done = False rec_unit = None for out_unit in self.units: if '%d:%d' % (ds.input_trf_id, in_unit.getID()) in out_unit.req_units: done = True # check if the unit is being recreated if out_unit.status == "recreating": rec_unit = out_unit break if not done or rec_unit: new_unit = self.createChainUnit( [in_unit], ds.use_copy_output) if new_unit: self.addChainUnitToTRF( new_unit, ds, in_unit.getID(), prev_unit=rec_unit) def createChainUnit(self, parent_units, use_copy_output=True): """Create a chained unit given the parent outputdata""" return IUnit() def addChainUnitToTRF(self, unit, inDS, unit_id=-1, prev_unit=None): """Add a chained unit to this TRF. 
Override for more control""" if unit_id == -1: unit.req_units.append('%d:ALL' % (inDS.input_trf_id)) unit.name = "Parent: TRF %d, All Units" % (inDS.input_trf_id) else: unit.req_units.append('%d:%d' % (inDS.input_trf_id, unit_id)) unit.name = "Parent: TRF %d, Unit %d" % ( inDS.input_trf_id, unit_id) self.addUnitToTRF(unit, prev_unit) def addInputData(self, inDS): """Add the given input dataset to the list""" self.inputdata.append(inDS) def pause(self): """Pause the task - the background thread will not submit new jobs from this task""" if self.status != "completed": self.updateStatus("pause") #self.status = "pause" task = self._getParent() if task: task.updateStatus() else: logger.debug("Transform is already completed!") def setRunlimit(self, newRL): """Set the number of times a job should be resubmitted before the transform is paused""" self.run_limit = newRL logger.debug("Runlimit set to %i", newRL) # Methods that can/should be overridden by derived classes def validate(self): """Override this to validate that the transform is OK""" from Ganga.GPIDev.Lib.Tasks.TaskLocalCopy import TaskLocalCopy # make sure a path has been selected for any local downloads if self.unit_copy_output is not None and isType(self.unit_copy_output, TaskLocalCopy): if self.unit_copy_output.local_location == '': logger.error("No path selected for Local Output Copy") return False if self.copy_output is not None and isType(self.copy_output, TaskLocalCopy): if self.copy_output.local_location == '': logger.error("No path selected for Local Output Copy") return False # this is a generic trf so assume the application and splitter will do # all the work return True def addUnitToTRF(self, unit, prev_unit=None): """Add a unit to this Transform given the input and output data""" if not unit: raise ApplicationConfigurationError(None, "addUnitTOTRF failed for Transform %d (%s): No unit specified" % (self.getID(), self.name)) addInfoString( self, "Adding Unit to TRF...") unit.updateStatus("hold") unit.active = True if prev_unit: unit.prev_job_ids += prev_unit.prev_job_ids self.units[prev_unit.getID()] = unit else: self.units.append(unit) stripProxy(unit).id = len(self.units) - 1 # Information methods def fqn(self): task = self._getParent() if task: return "Task %i Transform %i" % (task.id, task.transforms.index(self)) else: return "Unassigned Transform '%s'" % (self.name) def n_active(self): return sum([u.n_active() for u in self.units]) def n_all(self): return sum([u.n_all() for u in self.units]) def n_status(self, status): return sum([u.n_status(status) for u in self.units]) def info(self): logger.info(markup("%s '%s'" % (getName(self), self.name), status_colours[self.status])) logger.info("* backend: %s" % getName(self.backend)) logger.info("Application:") self.application.printTree() def updateStatus(self, status): """Update the transform status""" self.status = status def createUnitCopyOutputDS(self, unit_id): """Create a the Copy Output dataset to use with this unit. 
Overload to handle more than the basics""" from Ganga.GPIDev.Lib.Tasks.TaskLocalCopy import TaskLocalCopy if isType(self.unit_copy_output, TaskLocalCopy): logger.warning("Default implementation of createUnitCopyOutputDS can't handle datasets of type '%s'" % getName(self.unit_copy_output)) return # create copies of the Copy Output DS and add Unit name to path self.units[unit_id].copy_output = self.unit_copy_output.clone() self.units[unit_id].copy_output.local_location = os.path.join( self.unit_copy_output.local_location, self.units[unit_id].name.replace(":", "_").replace(" ", "").replace(",", "_")) def __setattr__(self, attr, value): if attr == 'outputfiles': if value != []: if self.outputdata is not None: logger.error( 'ITransform.outputdata is set, you can\'t set ITransform.outputfiles') return elif self.outputsandbox != []: logger.error( 'ITransform.outputsandbox is set, you can\'t set ITransform.outputfiles') return # reduce duplicate values here, leave only duplicates for LCG, # where we can have replicas uniqueValuesDict = [] uniqueValues = [] for val in value: key = '%s%s' % (getName(val), val.namePattern) if key not in uniqueValuesDict: uniqueValuesDict.append(key) uniqueValues.append(val) elif getName(val) == 'LCGSEFile': uniqueValues.append(val) super(ITransform, self).__setattr__(attr, uniqueValues) elif attr == 'inputfiles': if value != []: if self.inputsandbox != []: logger.error( 'ITransform.inputsandbox is set, you can\'t set ITransform.inputfiles') return super(ITransform, self).__setattr__(attr, value) elif attr == 'outputsandbox': if value != []: if getConfig('Output')['ForbidLegacyOutput']: logger.error( 'Use of ITransform.outputsandbox is forbidden, please use ITransform.outputfiles') return if self.outputfiles != []: logger.error( 'ITransform.outputfiles is set, you can\'t set ITransform.outputsandbox') return super(ITransform, self).__setattr__(attr, value) elif attr == 'inputsandbox': if value != []: if getConfig('Output')['ForbidLegacyInput']: logger.error( 'Use of ITransform.inputsandbox is forbidden, please use ITransform.inputfiles') return if self.inputfiles != []: logger.error( 'ITransform.inputfiles is set, you can\'t set ITransform.inputsandbox') return super(ITransform, self).__setattr__(attr, value) elif attr == 'outputdata': if value is not None: if getConfig('Output')['ForbidLegacyOutput']: logger.error( 'Use of ITransform.outputdata is forbidden, please use ITransform.outputfiles') return if self.outputfiles != []: logger.error( 'ITransform.outputfiles is set, you can\'t set ITransform.outputdata') return super(ITransform, self).__setattr__(attr, value) else: super(ITransform, self).__setattr__(attr, value) def resetUnitsByStatus(self, status='bad'): """Reset all units of a given status""" for unit in self.units: if unit.status == status: logger.info("Resetting Unit %d, Transform %d..." 
% (unit.getID(), self.getID())) self.resetUnit(unit.getID()) def checkUnitsAreCompleted(self, parent_units): """Check the given parent units are complete""" for parent in parent_units: if len(parent.active_job_ids) == 0 or parent.status != "completed": return False return True def getChainInclExclMasks(self, parent_units): """return the include/exclude masks from the TaskChainInput""" incl_pat_list = [] excl_pat_list = [] from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput for parent in parent_units: for inds in self.inputdata: if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID(): incl_pat_list += inds.include_file_mask excl_pat_list += inds.exclude_file_mask return incl_pat_list, excl_pat_list def getParentUnitJobs(self, parent_units, include_subjobs=True): """Return the list of parent jobs""" job_list = [] for parent in parent_units: job = getJobByID(parent.active_job_ids[0]) if job.subjobs: job_list += job.subjobs else: job_list += [job] return job_list def removeUnusedJobs(self): """Remove all jobs that aren't being used, e.g. failed jobs""" for unit in self.units: for jid in unit.prev_job_ids: try: logger.warning("Removing job '%d'..." % jid) job = getJobByID(jid) job.remove() except Exception as err: logger.debug("removeUnused: %s" % str(err)) logger.error("Problem removing job '%d'" % jid)
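# --- Hedged usage sketch (not part of the Ganga source): driving an ITransform
# from the GPI prompt via the exported methods listed above. The task/transform
# indices and limits are illustrative.
#   trf = tasks(0).transforms[0]
#   trf.overview()                   # per-unit status, coloured by state
#   trf.setMinorRunLimit(5)          # resubmits allowed before a rebroker
#   trf.setMajorRunLimit(2)          # rebrokers allowed before the transform pauses
#   trf.resetUnitsByStatus('bad')    # recreate any units flagged as bad
#   trf.removeUnusedJobs()           # remove jobs left over from failed attempts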
class Root(IPrepareApp): """ Root application -- running ROOT To run a job in ROOT you need to specify the CINT script to be executed. Additional files required at run time (shared libraries, source files, other scripts, Ntuples) should be placed in the inputsandbox of the job. Arguments can be passed onto the script using the 'args' field of the application. Defining a Simple Job: As an example the script analysis.C in the directory ~/abc might contain: void analysis(const char* type, int events) { std::cout << type << " " << events << std::endl; } To define an LCG job on the Ganga command line with this script, running in ROOT version 5.14.00b with the arguments 'MinBias' and 10, you would do the following: r = Root() r.version = '6.04.02' r.script = '~/abc/analysis.C' r.args = ['Minbias', 10] j = Job(application=r, backend=LCG()) Using Shared Libraries: If you have private shared libraries that should be loaded you need to include them in the inputsandbox. Files you want back as a result of running your job should be placed in your outputsandbox. The shared library mechanism is particularly useful in order to create a thin wrapper around code that uses precompiled libraries, or that has not been designed to work in the CINT environment. **For more detailed instructions, see the following Wiki page:** https://twiki.cern.ch/twiki/bin/view/ArdaGrid/HowToRootJobsSharedObject A summary of this page is given below: Consider the follow in CINT script, runMain.C, that makes use of a ROOT compatible shared library: void runMain(){ //set up main, eg command line opts char* argv[] = {"runMain.C","--muons","100"}; int argc = 3; //load the shared library gSystem->Load("libMain"); //run the code Main m(argv,argc); int returnCode = m.run(); } The class Main is as follows and has been compiled into a shared library, libMain.so. Main.h: #ifndef MAIN_H #define MAIN_H #include "TObject.h" class Main : public TObject { public: Main(){}//needed by Root IO Main(char* argv[], int argc); int run(); ClassDef(Main,1)//Needed for CINT }; #endif Main.cpp: #include <iostream> using std::cout; using std::endl; #include "Main.h" ClassImp(Main)//needed for CINT Main::Main(char* arvv[], int argc){ //do some setup, command line opts etc } int Main::run(){ cout << "Running Main..." << endl; return 0; } To run this on LCG, a Job could be created as follows: r = Root() r.version = '5.12.00' #version must be on LCG external site r.script = 'runMain.C' j = Job(application=r,backend=LCG()) j.inputsandbox = ['libMain.so'] It is a requirement that your script contains a function with the same name as the script itself and that the shared library file is built to be binary compatible with the Grid environment (e.g. same architecture and version of gcc). As shown above, the wrapper class must be made CINT compatible. This restriction does not, however, apply to classes used by the wrapper class. When running remote (e.g. LCG) jobs, the architecture used is 'slc3_ia32_gcc323' if the Root version is 5.16 or earlier and 'slc4_ia32_gcc34' otherwise. This reflects the availability of builds on the SPI server: http://service-spi.web.cern.ch/service-spi/external/distribution/ For backends that use a local installation of ROOT the location should be set correctly in the [Root] section of the configuration. Using Python and Root: The Root project provides bindings for Python, the language supported by the Ganga command line interface. These bindings are referred to as PyRoot. 
A job is run using PyRoot if the script has the '.py' extension or the usepython flag is set to True. There are many example PyRoot scripts available in the Root tutorials. A short example is given below: gengaus.py: if __name__ == '__main__': from ROOT import gRandom output = file('gaus.txt','w') try: for i in range(100): print(gRandom.Gaus(), file=output) finally: output.close() The above script could be run in Ganga as follows: r = Root() r.version = '5.14.00' r.script = '~/gengaus.py' r.usepython = True #set automatically for '.py' scripts j = Job(application=r,backend=Local()) j.outputsandbox = ['gaus.txt'] j.submit() When running locally, the python interpreter used for running PyRoot jobs will default to the one being used in the current Ganga session. The Root binaries selected must be binary compatible with this version. The pythonhome variable in the [Root] section of .gangarc controls which interpreter will be used for PyRoot jobs. When using PyRoot on a remote backend, e.g. LCG, the python version that is used will depend on that used to build the Root version requested. """ _schema = Schema( Version(1, 1), { 'script': FileItem( defvalue=None, preparable=1, doc= 'A File object specifying the script to execute when Root starts', checkset='_checkset_script'), 'args': SimpleItem( defvalue=[], typelist=[str, int], sequence=1, doc= "List of arguments for the script. Accepted types are numerics and strings" ), 'version': SimpleItem(defvalue='6.04.02', doc="The version of Root to run"), 'usepython': SimpleItem( defvalue=False, doc= "Execute 'script' using Python. The PyRoot libraries are added to the PYTHONPATH." ), 'is_prepared': SimpleItem( defvalue=None, strict_sequence=0, visitable=1, copyable=1, typelist=[None, bool], protected=1, hidden=0, comparable=1, doc= 'Location of shared resources. Presence of this attribute implies the application has been prepared.' ), 'hash': SimpleItem( defvalue=None, typelist=[None, str], hidden=1, doc= 'MD5 hash of the string representation of applications preparable attributes' ) }) _category = 'applications' _name = 'Root' _exportmethods = ['prepare', 'unprepare'] def __init__(self): super(Root, self).__init__() from Ganga.GPIDev.Lib.File import getSharedPath self.shared_path = Ganga.GPIDev.Lib.File.getSharedPath() if self.script is None or self.script == File(): self.script = getDefaultScript() def configure(self, masterappconfig): return (None, None) def unprepare(self, force=False): """ Revert a Root() application back to it's unprepared state. """ logger.debug('Running unprepare in Root app') if self.is_prepared is not None: self.decrementShareCounter(self.is_prepared) self.is_prepared = None self.hash = None def prepare(self, force=False): """ A method to place the Root application into a prepared state. """ if (self.is_prepared is not None) and (force is not True): raise ApplicationPrepareError( '%s application has already been prepared. Use prepare(force=True) to prepare again.' % getName(self)) self.is_prepared = ShareDir() logger.info('Created shared directory: %s' % (self.is_prepared.name)) try: copy_worked = self.copyPreparables() if copy_worked == 0: logger.error( 'Failed during prepare() phase. 
Unpreparing application.')
                self.unprepare()
                return 0
            else:
                # add the newly created shared directory into the metadata
                # system if the app is associated with a persisted object
                self.checkPreparedHasParent(self)
                self.post_prepare()
                return 1
        except:
            self.unprepare()
            raise

    def _checkset_script(self, value):
        """Callback used to set usepython to True if the script name has a *.py or *.PY extension."""
        from os.path import splitext
        (_, ext) = splitext(str(value.name))
        # use pyroot if this is a python script
        if '.py' == ext.lower():
            logger.debug('Setting usepython to True')
            self.usepython = True
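# --- Hedged usage sketch (not part of the Ganga source): the _checkset_script
# callback above switches a Root job to PyRoot when the script has a .py
# extension. The script path is illustrative.
#   r = Root()
#   r.version = '6.04.02'
#   r.script = '~/gengaus.py'        # .py extension -> usepython becomes True
#   j = Job(application=r, backend=Local())
#   j.submit()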
class Gaudi(GaudiBase):
    _name = 'Gaudi'
    __doc__ = GaudiDocString(_name)
    _category = 'applications'
    _exportmethods = GaudiBase._exportmethods[:]
    _exportmethods += ['prepare', 'unprepare']
    _hidden = 1
    _schema = GaudiBase._schema.inherit_copy()

    docstr = 'The gaudirun.py cli args that will be passed at run-time'
    _schema.datadict['args'] = SimpleItem(defvalue=['-T'], sequence=1, strict_sequence=0,
                                          typelist=['str', 'type(None)'], doc=docstr)
    docstr = 'The name of the optionsfile. Import statements in the file ' \
             'will be expanded at submission time and a full copy made'
    _schema.datadict['optsfile'] = FileItem(preparable=1, sequence=1, strict_sequence=0,
                                            defvalue=[], doc=docstr)
    docstr = 'A python configurable string that will be appended to the ' \
             'end of the options file. Can be multiline by using a ' \
             'notation like \nHistogramPersistencySvc().OutputFile = ' \
             '"myPlots.root"\nEventSelector().PrintFreq = 100\n or by ' \
             'using triple quotes around a multiline string.'
    _schema.datadict['extraopts'] = SimpleItem(preparable=1, defvalue=None,
                                               typelist=['str', 'type(None)'], doc=docstr)
    _schema.version.major += 0
    _schema.version.minor += 0

    def _auto__init__(self):
        """Bootstrap Gaudi applications. If called via a subclass,
        set up some basic structure like version, platform..."""
        self._init()

    def _parse_options(self):
        raise NotImplementedError

    def prepare(self, force=False):
        from Ganga.GPIDev.Lib.GangaList.GangaList import GangaList
        from Ganga.GPIDev.Lib.File.File import File
        if isType(self.optsfile, (list, tuple, GangaList)):
            for this_file in self.optsfile:
                if type(this_file) is str:
                    this_file = File(this_file)
                else:
                    continue
        elif type(self.optsfile) is str:
            self.optsfile = [File(self.optsfile)]

        try:
            super(Gaudi, self).prepare(force)
        except Exception as err:
            logger.debug("Super Prepare Error:\n%s" % str(err))
            raise err

        logger.debug("Prepare")

        _is_prepared = self.is_prepared
        #logger.info("_is_prepared: %s" % _is_prepared)

        share_dir = os.path.join(
            expandfilename(getConfig('Configuration')['gangadir']),
            'shared', getConfig('Configuration')['user'],
            _is_prepared.name)

        # We will return a list of files 'send_to_share' which will be copied into the jobs
        # inputsandbox when prepare called from job object. NOTE that these files will not go
        # into an inputsandbox when prepare called on standalone app.
        # Things in the inputsandbox end up in the working dir at runtime.

        # Exception is just re-thrown here after setting is_prepared to None;
        # could have done the setting in the actual functions but didn't want
        # prepared state altered from the readInputData pseudo-static member
        try:
            self._check_inputs()
        except Exception as err:
            logger.debug("_check_inputs Error:\n%s" % str(err))
            self.unprepare()
            raise err
class Transform(GangaObject): _schema = Schema( Version(1, 0), { 'status': SimpleItem(defvalue='new', protected=1, copyable=0, doc='Status - running, pause or completed', typelist=["str"]), 'name': SimpleItem(defvalue='Simple Transform', doc='Name of the transform (cosmetic)', typelist=["str"]), 'application': ComponentItem( 'applications', defvalue=None, optional=1, load_default=False, filter="checkTaskApplication", doc= 'Application of the Transform. Must be a Task-Supporting application.' ), 'inputsandbox': FileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Lib.File.File.File'], sequence=1, doc="list of File objects shipped to the worker node "), 'outputsandbox': SimpleItem( defvalue=[], typelist=['str'], sequence=1, doc="list of filenames or patterns shipped from the worker node" ), 'inputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Input dataset'), 'outputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Output dataset'), 'backend': ComponentItem('backends', defvalue=None, optional=1, load_default=False, doc='Backend of the Transform.'), 'run_limit': SimpleItem( defvalue=4, doc='Number of times a partition is tried to be processed.', protected=1, typelist=["int"]), '_partition_status': SimpleItem(defvalue={}, hidden=1, copyable=0, doc='Map (only necessary) partitions to their status'), '_app_partition': SimpleItem(defvalue={}, hidden=1, copyable=0, doc='Map of applications to partitions'), '_app_status': SimpleItem(defvalue={}, hidden=1, copyable=0, doc='Map of applications to status'), '_next_app_id': SimpleItem(defvalue=0, hidden=1, copyable=0, doc='Next ID used for the application', typelist=["int"]), }) _category = 'transforms' _name = 'Transform' _exportmethods = [ 'run', 'pause', # Operations 'setPartitionStatus', 'setRunlimit', 'setFailed', # Control Partitions 'getPartitionStatus', 'getJobs', 'getPartitionJobs', # Info 'overview', 'info', 'n_all', 'n_status', 'retryFailed' ] # _app_status = {} _partition_apps = None # possible partition status values: # ignored, hold, ready, running, completed, attempted, failed, bad # Special methods: def __init__(self): super(Transform, self).__init__() self.initialize() def _readonly(self): """A transform is read-only if the status is not new.""" if self.status == "new": return 0 return 1 def initialize(self): from Ganga import GPI self.backend = stripProxy(GPI.Local()) def check(self): pass def startup(self): """This function is used to set the status after restarting Ganga""" # Make sure that no partitions are kept "running" from previous # sessions clist = self._partition_status.keys() for c in clist: self.updatePartitionStatus(c) # At this point the applications still need to notify the Transformation of their status # Search jobs for task-supporting applications id = "%i:%i" % (self._getParent().id, self._getParent().transforms.index(self)) for j in GPI.jobs: if "tasks_id" in stripProxy(j.application).getNodeData(): # print "tasks_id of jobid ", j.fqid, # stripProxy(j.application).getNodeAttribute("tasks_id"), id if stripProxy(j.application).getNodeAttribute( "tasks_id").endswith(id): try: if j.subjobs: for sj in j.subjobs: app = stripProxy(sj.application) stripProxy(app.getTransform()).setAppStatus( app, app._getParent().status) else: app = stripProxy(j.application) stripProxy(app.getTransform()).setAppStatus( app, app._getParent().status) except AttributeError as e: logger.error("%s", e) def getPartitionApps(self): if self._partition_apps is None: # Create 
the reverse map _partition_apps from _app_partition self._partition_apps = {} for (app, partition) in self._app_partition.iteritems(): if partition in self._partition_apps: if not app in self._partition_apps[partition]: self._partition_apps[partition].append(app) else: self._partition_apps[partition] = [app] return self._partition_apps def fix(self): """This function fixes inconsistencies in application status""" # Create the reverse map _partition_apps from _app_partition self._app_status = {} # Make sure that no partitions are kept "running" from previous # sessions clist = self._partition_status.keys() for c in clist: self.updatePartitionStatus(c) # At this point the applications still need to notify the Transformation of their status # Search jobs for task-supporting applications id = "%i:%i" % (self._getParent().id, self._getParent().transforms.index(self)) for j in GPI.jobs: if "tasks_id" in stripProxy(j.application).getNodeData(): if stripProxy( j.application).getNodeAttribute("tasks_id") == id: try: if j.subjobs: for sj in j.subjobs: app = stripProxy(sj.application) stripProxy(app.getTransform()).setAppStatus( app, app._getParent().status) else: app = stripProxy(j.application) stripProxy(app.getTransform()).setAppStatus( app, app._getParent().status) except AttributeError as e: logger.error("%s", e) # Public methods def run(self, check=True): """Sets this transform to running status""" if self.status == "new" and check: self.check() if self.status != "completed": self.updateStatus("running") #self.status = "running" # Check if this transform has completed in the meantime is_complete = True for s in self._partition_status.values(): if s != "completed" and s != "bad": is_complete = False break if is_complete: self.updateStatus("completed") #self.status = "completed" task = self._getParent() if task: task.updateStatus() else: logger.warning("Transform is already completed!") def pause(self): """Pause the task - the background thread will not submit new jobs from this task""" if self.status != "completed": self.updateStatus("pause") #self.status = "pause" task = self._getParent() if task: task.updateStatus() else: logger.debug("Transform is already completed!") def setRunlimit(self, newRL): """Set the number of times a job should be resubmitted before the transform is paused""" self.run_limit = newRL cs = self._partition_status.items() for (c, s) in cs: if s in ["attempted", "failed"]: failures = self.getPartitionFailures(c) if failures >= newRL: self._partition_status[c] = "failed" else: self._partition_status[c] = "attempted" logger.debug("Runlimit set to %i", newRL) def setPartitionStatus(self, partition, status): """ Set the Status of the given partition to "ready", "hold", "bad" or "completed". The status is then updated to the status indicated by the applications""" self.setPartitionsStatus([partition], status) def getJobs(self): """ Get the job slice of all jobs for this transform """ return self.getPartitionJobs(None) def getPartitionJobs(self, partition): """ Get the job slice that processed the given partition. Iterates over the job list. 
""" task = self._getParent() id = task.transforms.index(self) if partition is None: sname = "tasks(%i).transforms[%i].getJobs()" % (task.id, id) else: sname = "tasks(%i).transforms[%i].getPartitionJobs(%s)" % ( task.id, id, partition) jobslice = JobRegistrySlice(sname) def addjob(j): if partition is None or self._app_partition[ j.application.id] == partition: jobslice.objects[j.fqid] = stripProxy(j) for j in GPI.jobs: try: stid = j.application.tasks_id.split(":") if int(stid[-2]) == task.id and int(stid[-1]) == id: if j.subjobs: for sj in j.subjobs: addjob(sj) else: addjob(j) except Exception as err: logger.debug("getPartitionJobs Exception:\n%s" % str(err)) pass return JobRegistrySliceProxy(jobslice) def setFailed(self, partition): """ Tells Tasks that all Applications that have executed this partition have actually failed.""" for aid in self._app_partition: if aid in self._app_status and self._app_status[aid] == "removed": continue # Save the status self._app_status[aid] = "failed" # Update the corresponding partition status self.setPartitionStatus(partition, "ready") def retryFailed(self): """Retry all failed partitions (forget about failed jobs)""" for aid in self._app_partition: if aid in self._app_status and self._app_status[aid] == "failed": self._app_status[aid] = "removed" clist = self._partition_status.keys() for c in clist: self.updatePartitionStatus(c) # Internal methods def finalise(self): """Finalise the transform - no-op by default""" return def submitJobs(self, n): """Create Ganga Jobs for the next N partitions that are ready and submit them.""" next = self.getNextPartitions(n) if len(next) == 0: return 0 numjobs = 0 for j in self.getJobsForPartitions(next): stripProxy(j.application).transition_update("submitting") try: j.submit() except JobError: logger.error( "Error on job submission! The current transform will be paused until this problem is fixed." ) logger.error( "type tasks(%i).run() to continue after the problem has been fixed.", self._getParent().id) self.pause() numjobs += 1 return numjobs def checkTaskApplication(self, app): """warns the user if the application is not compatible """ if app is None: return None if not "tasks_id" in stripProxy(app).getNodeData(): return taskApp(app) return app def setAppStatus(self, app, new_status): """Reports status changes in application jobs possible status values: normal : (new, submitting,) submitted, running, completing, completed failures : killed, failed transient: incomplete (->new), unknown, removed""" # Check if we know the occurring application... if app.id == -1: return if not app.id in self._app_partition: logger.warning("%s was contacted by an unknown application %i.", self.fqn(), app.id) return # Silently ignore message if the application is already removed or # completed if app.id in self._app_status and self._app_status[app.id] in [ "removed", "completed", "failed" ]: return # Check the status if new_status == "completed" and not self.checkCompletedApp(app): logger.error("%s app %i failed despite listed as completed!", self.fqn(), app.id) new_status = "failed" # Save the status self._app_status[app.id] = new_status # Update the corresponding partition status self.updatePartitionStatus(self._app_partition[app.id]) def setMasterJobStatus(self, job, new_status): """hook for a master job status update""" return def updatePartitionStatus(self, partition): """ Calculate the correct status of the given partition. "completed" and "bad" is never changed here "hold" is only changed to "completed" here. 
""" # print "updatePartitionStatus ", partition, " transform ", self.id # If the partition has status, and is not in a fixed state, check it! if partition in self._partition_status and ( not self._partition_status[partition] in ["bad", "completed"]): # if we have no applications, we are in "ready" state if not partition in self.getPartitionApps(): if self._partition_status[partition] != "hold": self._partition_status[partition] = "ready" else: status = [ self._app_status[app] for app in self.getPartitionApps()[partition] if app in self._app_status and not self._app_status[app] in ["removed", "killed"] ] # Check if we have completed this partition if "completed" in status: self._partition_status[partition] = "completed" # Check if we are not on hold elif self._partition_status[partition] != "hold": # Check if we are running running = False for stat in [ "completing", "running", "submitted", "submitting" ]: if stat in status: self._partition_status[partition] = "running" running = True break if not running: # Check if we failed #failures = len([stat for stat in status if stat in ["failed","new"]]) failures = self.getPartitionFailures(partition) if failures >= self.run_limit: self._partition_status[partition] = "failed" elif failures > 0: self._partition_status[partition] = "attempted" else: # Here we only have some "unknown" applications # This could prove difficult when launching new applications. Care has to be taken # to get the applications out of "unknown" stats as quickly as possible, to avoid double submissions. #logger.warning("Partition with only unknown applications encountered. This is probably not a problem.") self._partition_status[partition] = "ready" # Notify the next transform (if any) of the change in input status self.notifyNextTransform(partition) # Update the Tasks status if necessary task = self._getParent() if partition in self._partition_status and self._partition_status[ partition] in ["completed", "bad" ] and self.status == "running": for s in self._partition_status.values(): if s != "completed" and s != "bad": return #self.status = "completed" self.updateStatus("completed") if task: task.updateStatus() elif self.status == "completed": for s in self._partition_status.values(): if s != "completed" and s != "bad": self.updateStatus("running") #self.status = "running" if task: task.updateStatus() return def notifyNextTransform(self, partition): """ Notify any dependant transforms of the input update """ task = self._getParent() if task and (task.transforms.index(self) + 1 < len(task.transforms)): task.transforms[task.transforms.index(self) + 1].updateInputStatus( self, partition) def setPartitionsStatus(self, partitions, status): """ Set the Status of the partitions to "ready", "hold", "bad" or "completed". The status is then updated to the status indicated by the applications "bad" and "completed" is never changed except to "ignored", "hold" is only changed to "completed". 
""" if status == "ignored": [ self._partition_status.pop(c) for c in partitions if c in self._partition_status ] elif status in ["ready", "hold", "bad", "completed"]: for c in partitions: self._partition_status[c] = status else: logger.error( "setPartitionsStatus called with invalid status string %s", status) for c in partitions: self.updatePartitionStatus(c) def setPartitionsLimit(self, limitpartition): """ Set all partitions from and including limitpartition to ignored """ partitions = [c for c in self._partition_status if c >= limitpartition] self.setPartitionsStatus(partitions, "ignored") def getPartitionStatus(self, partition): if partition in self._partition_status: return self._partition_status[partition] else: return "ignored" def getNextPartitions(self, n): """Returns the N next partitions to process""" partitionlist = sorted(c for c, v in self._partition_status.items() if v in ["ready", "attempted"]) return partitionlist[:n] def getNewAppID(self, partition): """ Returns a new application ID and associates this ID with the partition given. """ id = self._next_app_id self._app_partition[id] = partition if partition in self.getPartitionApps(): self.getPartitionApps()[partition].append(id) else: self.getPartitionApps()[partition] = [id] self._next_app_id += 1 return id def createNewJob(self, partition): """ Returns a new job initialized with the transforms application, backend and name """ task = self._getParent( ) # this works because createNewJob is only called by a task id = task.transforms.index(self) j = GPI.Job() stripProxy(j).backend = self.backend.clone() stripProxy(j).application = self.application.clone() stripProxy(j).application.tasks_id = "%i:%i" % (task.id, id) stripProxy(j).application.id = self.getNewAppID(partition) j.inputdata = self.inputdata j.outputdata = self.outputdata j.inputsandbox = self.inputsandbox j.outputsandbox = self.outputsandbox j.name = "T%i:%i C%i" % (task.id, id, partition) return j # Methods that can/should be overridden by derived classes def checkCompletedApp(self, app): """Can be overriden to improve application completeness checking""" return True def updateInputStatus(self, ltf, partition): """Is called my the last transform (ltf) if the partition 'partition' changes status""" # per default no dependencies exist pass def getJobsForPartitions(self, partitions): """This is only an example, this class should be overridden by derived classes""" return [self.createNewJob(p) for p in partitions] # Information methods def fqn(self): task = self._getParent() if task: return "Task %i Transform %i" % (task.id, task.transforms.index(self)) else: return "Unassigned Transform '%s'" % (self.name) def n_all(self): return len(self._partition_status) def n_status(self, status): return len( [cs for cs in self._partition_status.values() if cs == status]) def overview(self): """ Get an ascii art overview over task status. Can be overridden """ task = self._getParent() if not task is None: id = str(task.transforms.index(self)) else: id = "?" 
o = markup("#%s: %s '%s'\n" % (id, getName(self), self.name), status_colours[self.status]) i = 0 partitions = sorted(self._partition_status.keys()) for c in partitions: s = self._partition_status[c] if c in self.getPartitionApps(): failures = self.getPartitionFailures(c) o += markup("%i:%i " % (c, failures), overview_colours[s]) else: o += markup("%i " % c, overview_colours[s]) i += 1 if i % 20 == 0: o += "\n" logger.info(o) def info(self): logger.info( markup("%s '%s'" % (getName(self), self.name), status_colours[self.status])) logger.info("* backend: %s" % getName(self.backend)) logger.info("Application:") self.application.printTree() def getPartitionFailures(self, partition): """Return the number of failures for this partition""" return len([ 1 for app in self.getPartitionApps()[partition] if app in self._app_status and self._app_status[app] in ["new", "failed"] ]) def updateStatus(self, status): """Update the transform status""" self.status = status
class IUnit(GangaObject): _schema = Schema( Version(1, 0), { 'status': SimpleItem(defvalue='new', protected=1, copyable=0, doc='Status - running, pause or completed', typelist=["str"]), 'name': SimpleItem(defvalue='Simple Unit', doc='Name of the unit (cosmetic)', typelist=["str"]), 'application': ComponentItem('applications', defvalue=None, optional=1, load_default=False, doc='Application of the Transform.'), 'inputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Input dataset'), 'outputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Output dataset'), 'active': SimpleItem(defvalue=False, hidden=1, doc='Is this unit active'), 'active_job_ids': SimpleItem(defvalue=[], typelist=['int'], sequence=1, hidden=1, doc='Active job ids associated with this unit'), 'prev_job_ids': SimpleItem(defvalue=[], typelist=['int'], sequence=1, hidden=1, doc='Previous job ids associated with this unit'), 'minor_resub_count': SimpleItem(defvalue=0, hidden=1, doc='Number of minor resubmits'), 'major_resub_count': SimpleItem(defvalue=0, hidden=1, doc='Number of major resubmits'), 'req_units': SimpleItem( defvalue=[], typelist=['str'], sequence=1, hidden=1, doc= 'List of units that must complete for this to start (format TRF_ID:UNIT_ID)' ), 'start_time': SimpleItem( defvalue=0, hidden=1, doc='Start time for this unit. Allows a delay to be put in'), 'copy_output': ComponentItem( 'datasets', defvalue=None, load_default=0, optional=1, doc= 'The dataset to copy the output of this unit to, e.g. Grid dataset -> Local Dataset' ), 'merger': ComponentItem('mergers', defvalue=None, load_default=0, optional=1, doc='Merger to be run after this unit completes.'), 'splitter': ComponentItem('splitters', defvalue=None, optional=1, load_default=False, doc='Splitter used on each unit of the Transform.'), 'postprocessors': ComponentItem( 'postprocessor', defvalue=None, doc='list of postprocessors to run after job has finished'), 'inputsandbox': FileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Lib.File.File.File'], sequence=1, doc="list of File objects shipped to the worker node "), 'inputfiles': GangaFileItem( defvalue=[], typelist=[ 'str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile' ], sequence=1, doc= "list of file objects that will act as input files for a job"), 'outputfiles': GangaFileItem( defvalue=[], typelist=[ 'str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile' ], sequence=1, doc="list of OutputFile objects to be copied to all jobs"), 'info': SimpleItem(defvalue=[], typelist=['str'], protected=1, sequence=1, doc="Info showing status transitions and unit info"), 'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Unit', typelist=["int"]), }) _category = 'units' _name = 'IUnit' _exportmethods = [] _hidden = 0 # Special methods: def __init__(self): super(IUnit, self).__init__() self.updateStatus("new") def _readonly(self): """A unit is read-only if the status is not new.""" if self.status == "new": return 0 return 1 def validate(self): """Validate that this unit is OK and set it to active""" self.active = True return True def getID(self): """Get the ID of this unit within the transform""" # if the id isn't already set, use the index from the parent Task if self.id < 0: trf = self._getParent() if not trf: raise ApplicationConfigurationError( None, "This unit has not been associated with a transform and so there is no ID available" ) self.id = trf.units.index(self) return self.id def updateStatus(self, status): """Update status hook""" 
addInfoString( self, "Status change from '%s' to '%s'" % (self.status, status)) self.status = status def createNewJob(self): """Create any jobs required for this unit""" pass def checkCompleted(self, job): """Check if this unit is complete""" if job.status == "completed": return True else: return False def checkForSubmission(self): """Check if this unit should submit a job""" # check the delay if time.time() < self.start_time: return False # check if we already have a job if len(self.active_job_ids) != 0: return False # if we're using threads, check the max number if self._getParent( ).submit_with_threads and GPI.queues.totalNumUserThreads( ) > self._getParent().max_active_threads: return False return True def checkForResubmission(self): """check if this unit should be resubmitted""" # check if we already have a job if len(self.active_job_ids) == 0: return False else: job = GPI.jobs(self.active_job_ids[0]) if job.status in ["failed", "killed"]: return True return False def checkParentUnitsAreComplete(self): """Check to see if the parent units are complete""" req_ok = True task = self._getParent()._getParent() for req in self.req_units: req_trf_id = int(req.split(":")[0]) if req.find("ALL") == -1: req_unit_id = int(req.split(":")[1]) if task.transforms[req_trf_id].units[ req_unit_id].status != "completed": req_ok = False else: # need all units from this trf for u in task.transforms[req_trf_id].units: if u.status != "completed": req_ok = False return req_ok def checkMajorResubmit(self, job): """check if this job needs to be fully rebrokered or not""" pass def majorResubmit(self, job): """perform a mjor resubmit/rebroker""" self.prev_job_ids.append(job.id) self.active_job_ids.remove(job.id) def minorResubmit(self, job): """perform just a minor resubmit""" try: trf = self._getParent() except Exception as err: logger.debug("GetParent exception!\n%s" % str(err)) trf = None if trf is not None and trf.submit_with_threads: addInfoString(self, "Attempting job re-submission with queues...") GPI.queues.add(job.resubmit) else: addInfoString(self, "Attempting job re-submission...") job.resubmit() def update(self): """Update the unit and (re)submit jobs as required""" #logger.warning("Entered Unit %d update function..." % self.getID()) # if we're complete, then just return if self.status in ["completed", "recreating"] or not self.active: return 0 # check if submission is needed task = self._getParent()._getParent() trf = self._getParent() maxsub = task.n_tosub() # check parent unit(s) req_ok = self.checkParentUnitsAreComplete() # set the start time if not already set if len(self.req_units) > 0 and req_ok and self.start_time == 0: self.start_time = time.time() + trf.chain_delay * 60 - 1 if req_ok and self.checkForSubmission() and maxsub > 0: # create job and submit addInfoString(self, "Creating Job...") j = self.createNewJob() if j.name == '': j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID()) try: if trf.submit_with_threads: addInfoString(self, "Attempting job submission with queues...") GPI.queues.add(j.submit) else: addInfoString(self, "Attempting job submission...") j.submit() except Exception as err: logger.debug("update Err: %s" % str(err)) addInfoString(self, "Failed Job Submission") addInfoString(self, "Reason: %s" % (formatTraceback())) logger.error("Couldn't submit the job. 
Deactivating unit.") self.prev_job_ids.append(j.id) self.active = False trf._setDirty() # ensure everything's saved return 1 self.active_job_ids.append(j.id) self.updateStatus("running") trf._setDirty() # ensure everything's saved if trf.submit_with_threads: return 0 return 1 # update any active jobs for jid in self.active_job_ids: # we have an active job so see if this job is OK and resubmit if # not try: job = GPI.jobs(jid) except Exception as err: logger.debug("Update2 Err: %s" % str(err)) logger.warning( "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue if job.status == "completed": # check if actually completed if not self.checkCompleted(job): return 0 # check for DS copy if trf.unit_copy_output: if not self.copy_output: trf.createUnitCopyOutputDS(self.getID()) if not self.copyOutput(): return 0 # check for merger if trf.unit_merger: if not self.merger: self.merger = trf.createUnitMerger(self.getID()) if not self.merge(): return 0 # all good so mark unit as completed self.updateStatus("completed") elif job.status == "failed" or job.status == "killed": # check for too many resubs if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1: logger.error( "Too many resubmits (%i). Deactivating unit." % (self.minor_resub_count + self.major_resub_count)) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count)) self.active = False return 0 rebroker = False if self.minor_resub_count > trf.minor_run_limit - 1: if self._getParent().rebroker_on_job_fail: rebroker = True else: logger.error( "Too many minor resubmits (%i). Deactivating unit." % self.minor_resub_count) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.minor_resub_count)) self.active = False return 0 if self.major_resub_count > trf.major_run_limit - 1: logger.error( "Too many major resubmits (%i). Deactivating unit." % self.major_resub_count) addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count)) self.active = False return 0 # check the type of resubmit if rebroker or self.checkMajorResubmit(job): self.major_resub_count += 1 self.minor_resub_count = 0 try: addInfoString(self, "Attempting major resubmit...") self.majorResubmit(job) except Exception as err: logger.debug("Update Err3: %s" % str(err)) logger.error( "Couldn't resubmit the job. Deactivating unit.") addInfoString(self, "Failed Job resubmission") addInfoString(self, "Reason: %s" % (formatTraceback())) self.active = False # break the loop now because we've probably changed the # active jobs list return 1 else: self.minor_resub_count += 1 try: addInfoString(self, "Attempting minor resubmit...") self.minorResubmit(job) except Exception as err: logger.debug("Update Err4: %s" % str(err)) logger.error( "Couldn't resubmit the job. 
Deactivating unit.") addInfoString(self, "Failed Job resubmission") addInfoString(self, "Reason: %s" % (formatTraceback())) self.active = False return 1 def reset(self): """Reset the unit completely""" addInfoString(self, "Reseting Unit...") self.minor_resub_count = 0 self.major_resub_count = 0 if len(self.active_job_ids) > 0: self.prev_job_ids += self.active_job_ids self.active_job_ids = [] self.active = True # if has parents, set to recreate if len(self.req_units) > 0: self.updateStatus("recreating") else: self.updateStatus("running") # Info routines def n_active(self): if self.status == 'completed': return 0 tot_active = 0 active_states = ['submitted', 'running'] for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_active Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue j = stripProxy(job) # try to preserve lazy loading if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache( ) and 'subjobs:status' in j.getNodeIndexCache(): if len(j.getNodeIndexCache()['subjobs:status']) > 0: for sj_stat in j.getNodeIndexCache()['subjobs:status']: if sj_stat in active_states: tot_active += 1 else: if j.getNodeIndexCache()['status'] in active_states: tot_active += 1 else: #logger.warning("WARNING: (active check) No index cache for job object %d" % jid) if j.status in active_states: if j.subjobs: for sj in j.subjobs: if sj.status in active_states: tot_active += 1 else: tot_active += 1 return tot_active def n_status(self, status): tot_active = 0 for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_status Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue j = stripProxy(job) # try to preserve lazy loading if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache( ) and 'subjobs:status' in j.getNodeIndexCache(): if len(j.getNodeIndexCache()['subjobs:status']) > 0: for sj_stat in j.getNodeIndexCache()['subjobs:status']: if sj_stat == status: tot_active += 1 else: if j.getNodeIndexCache()['status'] == status: tot_active += 1 else: #logger.warning("WARNING: (status check) No index cache for job object %d" % jid) if j.subjobs: for sj in j.subjobs: if sj.status == status: tot_active += 1 else: if j.status == status: tot_active += 1 return tot_active def n_all(self): total = 0 for jid in self.active_job_ids: try: job = GPI.jobs(jid) except Exception as err: logger.debug("n_all Err: %s" % str(err)) task = self._getParent()._getParent() trf = self._getParent() logger.warning( "Cannot find job with id %d. 
Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" % (jid, task.id, trf.getID(), self.getID())) continue j = stripProxy(job) # try to preserve lazy loading if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache( ) and 'subjobs:status' in j.getNodeIndexCache(): if len(j.getNodeIndexCache()['subjobs:status']) != 0: total += len(j.getNodeIndexCache()['subjobs:status']) else: total += 1 else: #logger.warning("WARNING: (status check) No index cache for job object %d" % jid) if j.subjobs: total += len(j.subjobs) else: total += 1 return total def overview(self): """Print an overview of this unit""" o = " Unit %d: %s " % (self.getID(), self.name) for s in ["submitted", "running", "completed", "failed", "unknown"]: o += markup("%i " % self.n_status(s), overview_colours[s]) print(o) def copyOutput(self): """Copy any output to the given dataset""" logger.error( "No default implementation for Copy Output - contact plugin developers" ) return False
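# --- Illustrative sketch (not part of the original Ganga source): a minimal
# concrete unit built on the IUnit hooks above. It shows the contract a task
# plugin is expected to fulfil: createNewJob() builds the Job for this unit
# from the unit's own schema fields, and checkMajorResubmit() decides whether a
# failed job should be fully rebrokered rather than minor-resubmitted. The
# class name SketchUnit is hypothetical, GPI.Job() is assumed to be available
# in this module's GPI namespace, and the direct attribute assignments are a
# simplification of what a real plugin would do.
class SketchUnit(IUnit):
    _schema = IUnit._schema.inherit_copy()
    _category = 'units'
    _name = 'SketchUnit'

    def createNewJob(self):
        """Create the job for this unit, copying settings from the unit schema."""
        j = GPI.Job()
        j.name = self.name
        j.application = self.application
        if self.splitter is not None:
            j.splitter = self.splitter
        if self.inputdata is not None:
            j.inputdata = self.inputdata
        j.inputfiles = self.inputfiles
        j.outputfiles = self.outputfiles
        return j

    def checkMajorResubmit(self, job):
        """Only minor resubmits in this sketch; a real unit might rebroker on repeated site failures."""
        return False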
class GaudiPython(GaudiBase): """The GaudiPython Application handler The GaudiPython application handler is for running LHCb GaudiPython jobs. This means running scripts where you are in control of the event loop etc. If you are usually running jobs using the gaudirun script this is *not* the application handler you should use. Instead use the DaVinci, Gauss, ... handlers. For its configuration it needs to know what application and version to use for setting up the environment. More detailed configuration options are described in the schema below. An example of submitting a GaudiPython job to Dirac could be: app = GaudiPython(project='DaVinci', version='v19r14') # Give absolute path to the python file to be executed. # If several files are given the subsequent ones will go into the # sandbox but it is the user's responsibility to include them app.script = ['/afs/...../myscript.py'] # Define dataset ds = LHCbDataset(['LFN:spam','LFN:eggs']) # Construct and submit job object j=Job(application=app,backend=Dirac(),inputdata=ds) j.submit() """ _name = 'GaudiPython' _category = 'applications' _exportmethods = GaudiBase._exportmethods[:] _exportmethods += ['prepare', 'unprepare'] _schema = GaudiBase._schema.inherit_copy() docstr = 'The package the application belongs to (e.g. "Sim", "Phys")' _schema.datadict['package'] = SimpleItem(defvalue=None, typelist=['str', 'type(None)'], doc=docstr) docstr = 'The package where your top level requirements file is read ' \ 'from. Can be written either as a path ' \ '\"Tutorial/Analysis/v6r0\" or in traditional notation ' \ '\"Analysis v6r0 Tutorial\"' _schema.datadict['masterpackage'] = SimpleItem( defvalue=None, typelist=['str', 'type(None)'], doc=docstr) docstr = 'Extra options to be passed onto the SetupProject command '\ 'used for configuring the environment. As an example '\ 'setting it to \'--dev\' will give access to the DEV area. '\ 'For full documentation of the available options see '\ 'https://twiki.cern.ch/twiki/bin/view/LHCb/SetupProject' _schema.datadict['setupProjectOptions'] = SimpleItem( defvalue='', typelist=['str', 'type(None)'], doc=docstr) docstr = 'The name of the script to execute. A copy will be made ' + \ 'at submission time' _schema.datadict['script'] = FileItem(preparable=1, sequence=1, strict_sequence=0, defvalue=[], doc=docstr) docstr = "List of arguments for the script" _schema.datadict['args'] = SimpleItem(defvalue=[], typelist=['str'], sequence=1, doc=docstr) docstr = 'The name of the Gaudi application (e.g. 
"DaVinci", "Gauss"...)' _schema.datadict['project'] = SimpleItem(preparable=1, defvalue=None, typelist=['str', 'type(None)'], doc=docstr) _schema.version.major += 2 _schema.version.minor += 0 def _get_default_version(self, gaudi_app): return guess_version(self, gaudi_app) def _attribute_filter__set__(self, n, v): if n == 'project': self.appname = v return v def _auto__init__(self): if (not self.appname) and (not self.project): self.project = 'DaVinci' # default if (not self.appname): self.appname = self.project self._init() def _getshell(self): import EnvironFunctions return EnvironFunctions._getshell(self) def prepare(self, force=False): super(GaudiPython, self).prepare(force) self._check_inputs() share_dir = os.path.join( expandfilename(getConfig('Configuration')['gangadir']), 'shared', getConfig('Configuration')['user'], self.is_prepared.name) fillPackedSandbox( self.script, os.path.join(share_dir, 'inputsandbox', '_input_sandbox_%s.tar' % self.is_prepared.name)) gzipFile( os.path.join(share_dir, 'inputsandbox', '_input_sandbox_%s.tar' % self.is_prepared.name), os.path.join(share_dir, 'inputsandbox', '_input_sandbox_%s.tgz' % self.is_prepared.name), True) # add the newly created shared directory into the metadata system if # the app is associated with a persisted object self.checkPreparedHasParent(self) self.post_prepare() def master_configure(self): return (None, StandardJobConfig()) def configure(self, master_appconfig): # self._configure() name = join('.', self.script[0].subdir, split(self.script[0].name)[-1]) script = "from Gaudi.Configuration import *\n" if self.args: script += 'import sys\nsys.argv += %s\n' % str(self.args) script += "importOptions('data.py')\n" script += "execfile(\'%s\')\n" % name # add summary.xml outputsandbox_temp = XMLPostProcessor._XMLJobFiles() outputsandbox_temp += unique(self.getJobObject().outputsandbox) outputsandbox = unique(outputsandbox_temp) input_files = [] input_files += [FileBuffer('gaudipython-wrapper.py', script)] logger.debug("Returning Job Configuration") return (None, StandardJobConfig(inputbox=input_files, outputbox=outputsandbox)) def _check_inputs(self): """Checks the validity of user's entries for GaudiPython schema""" if len(self.script) == 0: logger.warning("No script defined. Will use a default " 'script which is probably not what you want.') self.script = [ File( os.path.join( os.path.dirname(inspect.getsourcefile(GaudiPython)), 'options/GaudiPythonExample.py')) ] else: for f in self.script: f.name = fullpath(f.name) return def postprocess(self): XMLPostProcessor.postprocess(self, logger)
class _DSTMergeTool(IMergeTool): _category = 'merge_tools' _hidden = 1 _name = '_DSTMergeTool' _schema = IMergeTool._schema.inherit_copy() docstr = 'Path to an options file to use when merging.' _schema.datadict['merge_opts'] = FileItem(defvalue=None,doc=docstr) docstr = 'The version of DaVinci to use when merging. (e.g. v19r14)' _schema.datadict['version'] = SimpleItem(defvalue='',doc=docstr) def selectOptionsFile(self, version_string): """Tries to find the correct version of the options file to use based on the version.""" dir = os.path.dirname(inspect.getsourcefile(_DSTMergeTool)) options_dir = os.path.join(dir,'options') #search for the version of the merge opts which most closely matches 'version' import glob files = glob.glob(options_dir + os.path.sep + 'DSTMerger*.opts') #try to find the best options file to use opts_files = {} for f in files: file_name = os.path.basename(f) v = None #remove the .opts part if file_name.endswith('.opts'): file_name = file_name[0:-5] #remove the DSTMerger bit if file_name.startswith('DSTMerger-'): file_name = file_name[10:] if file_name: v = CMTVersion(file_name) else: v = CMTVersion() opts_files[v] = f #the result to return opts_file = None #iterate over the versions in order keys = opts_files.keys() keys.sort() saved = keys[-1]#default is latest one if version_string: version = CMTVersion(version_string) for k in keys: if version < k: break else: saved = k opts_file = opts_files[saved] return opts_file def mergefiles(self, file_list, output_file): #if no opts file is specified, then use version from installation if self.merge_opts is None or not self.merge_opts.name: self.merge_opts = File(self.selectOptionsFile(self.version)) if not os.path.exists(self.merge_opts.name): msg = "The options file '%s' needed for merging does not exist." raise MergerError(msg % self.merge_opts.name) logger.info("Using the options file '%s'.", self.merge_opts.name) #this is the bit specifying the files output_opts = """ // the output file from the merge InputCopyStream.Output = "DATAFILE='PFN:%s' TYP='POOL_ROOTTREE' OPT='REC'"; //the files to merge EventSelector.Input = {""" % output_file file_sep = ',' file_len = len(file_list) for i in xrange(file_len): file_name = file_list[i] if i == (file_len - 1): file_sep = '' #i.e. last entry does not have a comma output_opts += """ "DATAFILE='PFN:%s' TYP='POOL_ROOTTREE' OPT='READ'"%s""" % (file_name, file_sep) output_opts += """ };""" #write this out to a file (open outside the try block so a failed open cannot trigger a NameError in the finally clause) opts_file_name = tempfile.mktemp('.opts') opts_file = file(opts_file_name,'w') try: opts_file.write(output_opts) finally: opts_file.close() if not os.path.exists(opts_file_name): msg = "Failed to write temporary options file '%s' during merge" raise MergerError(msg % opts_file_name) #now run gaudirun via a script shell_script = """#!/bin/sh SP=`which SetupProject.sh` if [ -n $SP ]; then . SetupProject.sh --ignore-missing DaVinci %s else echo "Could not find the SetupProject.sh script. Your job will probably fail" fi gaudirun.py %s %s exit $? 
""" % (self.version, self.merge_opts.name, opts_file_name) script_file_name = tempfile.mktemp('.sh') try: script_file = file(script_file_name,'w') script_file.write(shell_script) finally: script_file.close() return_code = subprocess.call(['/bin/sh',script_file_name]) if return_code != 0: msg = 'The DSTMerger returned %i when calling gaudirun' logger.warning(msg % return_code) #finally clean up os.unlink(script_file_name) os.unlink(opts_file_name) if not os.path.exists(output_file): msg = "The output file '%s' was not created" raise MergerError(msg % output_file)