def updateStatus(self, status):
    """If we've just been switched to running, the job has been submitted, so update the DB."""
    if (status == "running") and (self.status == "new") and len(self.active_job_ids) != 0:
        ins_fields = "run,description,decay_type,radcor,mc_version,seed,events,output_name,jdl,mac,exe,stderr,stdout,status_url,submitter,submitted_on,status"
        # (submitted_on format is "2013-MM-DD HH:mm:ss")
        app = getJobByID(self.active_job_ids[0]).application
        ins_vals = "%d, '%s production job', %d, %d, %d, %d, %d, '%s_v%d_r%d.root', '__jdlfile__', '%s', '%s', 'na62run%d.err', 'na62run%d.out', '%s', 'ganga', '%s', 'SUBMITTED'" % (
            app.run_number, app.decay_name, app.decay_type, app.radcor,
            app.mc_version, app.run_number, app.num_events, app.file_prefix,
            app.mc_version, app.run_number, app._impl.getMACFileName(),
            app.script_name, app.run_number, app.run_number,
            getJobByID(self.active_job_ids[0]).backend.id,
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        nec_file = ".gpytho"
        work_dir = "/clusterhome/home/protopop"
        nec_str = open(os.path.join(work_dir, nec_file)).read().strip().strip('#')

        mysqlc = "mysql -hhughnon.ppe.gla.ac.uk -ugridbot -p%s -s GridJobs" % nec_str

        rc, out = getstatusoutput("echo \"INSERT INTO jobs (%s) VALUES (%s)\" | %s" %
                                  (ins_fields, ins_vals, mysqlc))
        if rc != 0:
            logger.error(out)
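# --- Hedged aside (not part of the original unit): the INSERT above is built by string
# formatting and piped through the mysql command-line client, which is fragile with
# respect to shell quoting. Below is a minimal sketch of the same insert done with a
# parameterized query, assuming the mysql-connector-python package is available; the
# helper name and its use here are illustrative only, while the host, user and
# database names are taken from the command string above.
import mysql.connector


def insert_job_record_sketch(ins_fields, values, password):
    """Illustrative alternative to piping an INSERT through the mysql CLI."""
    conn = mysql.connector.connect(host="hughnon.ppe.gla.ac.uk", user="gridbot",
                                   password=password, database="GridJobs")
    try:
        # one %s placeholder per value; the driver handles quoting and escaping
        placeholders = ", ".join(["%s"] * len(values))
        cur = conn.cursor()
        cur.execute("INSERT INTO jobs (%s) VALUES (%s)" % (ins_fields, placeholders),
                    tuple(values))
        conn.commit()
    finally:
        conn.close()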
def getJobs(self):
    """ Get the job slice of all jobs that process this task """
    jobslice = JobRegistrySlice("tasks(%i).getJobs()" % (self.id))
    for trf in self.transforms:
        for jid in trf.getJobs():
            job = getJobByID(jid)
            jobslice.objects[job.fqid] = stripProxy(job)
    return JobRegistrySliceProxy(jobslice)
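# Hedged usage sketch: from a Ganga session the slice returned by getJobs() behaves
# like any other job registry slice and can be iterated directly. The task id 0 and
# the attributes printed are illustrative only.
#
#   for j in tasks(0).getJobs():
#       print(j.fqid, j.status)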
def n_all(self):
    total = 0
    for jid in self.active_job_ids:
        try:
            job = getJobByID(jid)
        except Exception as err:
            logger.debug("n_all Err: %s" % str(err))
            task = self._getParent()._getParent()
            trf = self._getParent()
            logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                           (jid, task.id, trf.getID(), self.getID()))
            continue

        j = stripProxy(job)

        # try to preserve lazy loading
        if hasattr(j, '_index_cache') and j._index_cache and 'subjobs:status' in j._index_cache:
            if len(j._index_cache['subjobs:status']) != 0:
                total += len(j._index_cache['subjobs:status'])
            else:
                total += 1
        else:
            #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
            if j.subjobs:
                total += len(j.subjobs)
            else:
                total += 1

    return total
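# Hedged recovery sketch, lifted from the warning message above (all ids are
# illustrative): if a unit references a job id that no longer exists, the unit can
# be reset from the GPI with
#
#   tasks(0).transforms[0].resetUnit(0)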
def updateStatus(self, status):
    """Update status hook"""

    # check for input data deletion of chain data
    if status == "completed" and self._getParent().delete_chain_input and len(self.req_units) > 0:

        # the inputdata field *must* be filled from the parent task
        # NOTE: When changing to inputfiles, will probably need to check
        # for any specified in trf.inputfiles

        # check that the parent replicas have been copied by checking
        # backend status == Done
        job_list = []
        for req_unit in self.req_units:
            trf = self._getParent()._getParent().transforms[int(req_unit.split(":")[0])]
            req_unit_id = req_unit.split(":")[1]

            if req_unit_id != "ALL":
                unit = trf.units[int(req_unit_id)]
                job_list.append(getJobByID(unit.active_job_ids[0]))
            else:
                for unit in trf.units:
                    job_list.append(getJobByID(unit.active_job_ids[0]))

        for j in job_list:
            if j.subjobs:
                for sj in j.subjobs:
                    if sj.backend.status != "Done":
                        return
            else:
                if j.backend.status != "Done":
                    return

        job = getJobByID(self.active_job_ids[0])
        for f in job.inputdata.files:
            # check for an lfn
            if hasattr(f, "lfn"):
                fname = f.lfn
            else:
                fname = f.namePattern

            logger.warning("Removing chain inputdata file '%s'..." % fname)
            f.remove()

    super(LHCbUnit, self).updateStatus(status)
def remove(self, remove_jobs="do_nothing"):
    """Delete the task"""

    # make sure the task isn't running
    if self.status.find("running") != -1:
        logger.error("Task is still running. Please pause before removing!")
        return

    if remove_jobs not in [True, False]:
        logger.info("You want to remove the task %i named '%s'." % (self.id, self.name))
        logger.info("Since this operation cannot be easily undone, please call this command again:")
        logger.info(" * as tasks(%i).remove(remove_jobs=True) if you want to remove all associated jobs," % (self.id))
        logger.info(" * as tasks(%i).remove(remove_jobs=False) if you want to keep the jobs." % (self.id))
        return

    if remove_jobs:
        for trf in self.transforms:
            for unit in trf.units:
                for jid in unit.active_job_ids:
                    try:
                        j = getJobByID(jid)
                        j.remove()
                    except Exception as err:
                        logger.debug("Remove Err: %s" % str(err))

                for jid in unit.prev_job_ids:
                    try:
                        j = getJobByID(jid)
                        j.remove()
                    except Exception as err2:
                        logger.debug("Remove Err2: %s" % str(err2))

    self._getRegistry()._remove(self, auto_removed=1)
    logger.info("Task #%s deleted" % self.id)
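# Hedged usage sketch, taken directly from the confirmation messages above (task id 0
# is illustrative):
#
#   tasks(0).remove(remove_jobs=True)   # remove the task and all associated jobs
#   tasks(0).remove(remove_jobs=False)  # remove the task but keep the jobs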
def getParentUnitJobs(self, parent_units, include_subjobs=True):
    """Return the list of parent jobs"""
    job_list = []
    for parent in parent_units:
        job = getJobByID(parent.active_job_ids[0])
        if job.subjobs:
            job_list += job.subjobs
        else:
            job_list += [job]

    return job_list
def removeUnusedJobs(self):
    """Remove all jobs that aren't being used, e.g. failed jobs"""
    for unit in self.units:
        for jid in unit.prev_job_ids:
            try:
                logger.warning("Removing job '%d'..." % jid)
                job = getJobByID(jid)
                job.remove()
            except Exception as err:
                logger.debug("removeUnused: %s" % str(err))
                logger.error("Problem removing job '%d'" % jid)
def createChainUnit(self, parent_units, use_copy_output=True):
    """Create an output unit given this output data"""

    # we need a parent job that has completed to get the output files
    incl_pat_list = []
    excl_pat_list = []
    for parent in parent_units:
        if len(parent.active_job_ids) == 0 or parent.status != "completed":
            return None

        for inds in self.inputdata:
            from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
            if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID():
                incl_pat_list += inds.include_file_mask
                excl_pat_list += inds.exclude_file_mask

    # go over the output files and copy the appropriate ones over as input
    # files
    flist = []
    import re
    for parent in parent_units:
        job = getJobByID(parent.active_job_ids[0])
        if job.subjobs:
            job_list = job.subjobs
        else:
            job_list = [job]

        for sj in job_list:
            for f in sj.outputfiles:

                # match any dirac files that are allowed in the file mask
                if isType(f, DiracFile):
                    if len(incl_pat_list) > 0:
                        for pat in incl_pat_list:
                            if re.search(pat, f.lfn):
                                flist.append("LFN:" + f.lfn)
                    else:
                        flist.append("LFN:" + f.lfn)

                    if len(excl_pat_list) > 0:
                        for pat in excl_pat_list:
                            if re.search(pat, f.lfn) and "LFN:" + f.lfn in flist:
                                flist.remove("LFN:" + f.lfn)

    # just do one unit that uses all data
    unit = LHCbUnit()
    unit.name = "Unit %d" % len(self.units)
    unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

    return unit
def checkForResubmission(self):
    """check if this unit should be resubmitted"""

    # check if we already have a job
    if len(self.active_job_ids) == 0:
        return False
    else:
        job = getJobByID(self.active_job_ids[0])
        if job.status in ["failed", "killed"]:
            return True

    return False
def createChainUnit(self, parent_units, use_copy_output=True):
    """Create a chained unit using the output data from the given units"""

    # check all parent units for copy_output
    copy_output_ok = True
    for parent in parent_units:
        if not parent.copy_output:
            copy_output_ok = False

    # all parent units must be completed so the outputfiles are filled correctly
    for parent in parent_units:
        if parent.status != "completed":
            return None

    if len(parent_units) == 0:
        return None

    if not use_copy_output or not copy_output_ok:
        unit = ND280Unit()
        unit.inputdata = ND280LocalDataset()
        for parent in parent_units:
            # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
            job = getJobByID(parent.active_job_ids[0])
            for f in job.outputfiles:
                # should check for different file types and add them as appropriate to the dataset
                # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
                # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
                # TODO: implement use of include/exclude_file_mask (see the sketch after this function)
                try:
                    outputfilenameformat = f.outputfilenameformat
                except AttributeError:
                    # the file type does not define an outputfilenameformat, so fall back to the job output dir
                    inputdir = job.outputdir
                else:
                    # WARNING: The following will work only if the MassStorageFile puts the files in local directories!
                    inputdir = '/'.join([getConfig('Output')['MassStorageFile']['uploadOptions']['path'],
                                         f.outputfilenameformat.replace('{fname}', '')])
                unit.inputdata.get_dataset(inputdir, f.namePattern)
    else:
        unit = ND280Unit()
        unit.inputdata = ND280LocalDataset()
        for parent in parent_units:
            # unit needs to have completed and downloaded before we can get file list
            if parent.status != "completed":
                return None

            # we should be OK so copy all output to the dataset
            for f in parent.copy_output.files:
                unit.inputdata.names.append(os.path.join(parent.copy_output.local_location, f))

    return unit
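# --- Hedged sketch for the TODO above (not part of the original transform): the
# include/exclude masks carried by a TaskChainInput dataset could be applied with
# re.search, mirroring the mask handling in the LHCb createChainUnit shown earlier.
# The helper name below is illustrative only.
import re


def passes_file_masks_sketch(name, include_masks, exclude_masks):
    """Return True if 'name' matches at least one include mask (when any are given)
    and matches none of the exclude masks."""
    if include_masks and not any(re.search(pat, name) for pat in include_masks):
        return False
    if any(re.search(pat, name) for pat in exclude_masks):
        return False
    return True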
def n_active(self):

    if self.status == 'completed':
        return 0

    tot_active = 0
    active_states = ['submitted', 'running']

    for jid in self.active_job_ids:

        try:
            job = getJobByID(jid)
        except Exception as err:
            logger.debug("n_active Err: %s" % str(err))
            task = self._getParent()._getParent()
            trf = self._getParent()
            logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                           (jid, task.id, trf.getID(), self.getID()))
            continue

        j = stripProxy(job)

        # try to preserve lazy loading
        if hasattr(j, '_index_cache') and j._index_cache and 'subjobs:status' in j._index_cache:
            if len(j._index_cache['subjobs:status']) > 0:
                for sj_stat in j._index_cache['subjobs:status']:
                    if sj_stat in active_states:
                        tot_active += 1
            else:
                if j._index_cache['status'] in active_states:
                    tot_active += 1
        else:
            #logger.warning("WARNING: (active check) No index cache for job object %d" % jid)
            if j.status in active_states:
                if j.subjobs:
                    for sj in j.subjobs:
                        if sj.status in active_states:
                            tot_active += 1
                else:
                    tot_active += 1

    return tot_active
def removeUnusedData(self):
    """Remove any output data from orphaned jobs"""
    for unit in self.units:
        for jid in unit.prev_job_ids:
            try:
                logger.warning("Removing data from job '%d'..." % jid)
                job = getJobByID(jid)

                jlist = []
                if len(job.subjobs) > 0:
                    jlist = job.subjobs
                else:
                    jlist = [job]

                for sj in jlist:
                    for f in sj.outputfiles:
                        if isType(f, DiracFile) and f.lfn:
                            f.remove()
            except Exception:
                logger.error("Problem deleting data for job '%d'" % jid)
def createChainUnit(self, parent_units, use_copy_output=True):
    """Create a chained unit using the output data from the given units"""

    # check all parent units for copy_output
    copy_output_ok = True
    for parent in parent_units:
        if not parent.copy_output:
            copy_output_ok = False

    # all parent units must be completed so the outputfiles are filled correctly
    for parent in parent_units:
        if parent.status != "completed":
            return None

    if not use_copy_output or not copy_output_ok:
        unit = ND280Unit_CSVEvtList()
        unit.inputdata = ND280LocalDataset()
        for parent in parent_units:
            # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
            job = getJobByID(parent.active_job_ids[0])
            for f in job.outputfiles:
                # should check for different file types and add them as appropriate to the dataset
                # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
                # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
                unit.inputdata.names.append(os.path.join(job.outputdir, f.namePattern))
    else:
        unit = ND280Unit_CSVEvtList()
        unit.inputdata = ND280LocalDataset()
        for parent in parent_units:
            # unit needs to have completed and downloaded before we can get file list
            if parent.status != "completed":
                return None

            # we should be OK so copy all output to the dataset
            for f in parent.copy_output.files:
                unit.inputdata.names.append(os.path.join(parent.copy_output.local_location, f))

    return unit
def updateQuery(self, resubmit=False):
    """Update the dataset information of the transform. This will include
    any new data in the processing and re-run jobs whose data has been removed."""
    if len(self.queries) == 0:
        raise GangaException(
            None, 'Cannot call updateQuery() on an LHCbTransform without any queries')

    if self._getParent() != None:
        logger.info('Retrieving latest bookkeeping information for transform %i:%i, please wait...' %
                    (self._getParent().id, self.getID()))
    else:
        logger.info('Retrieving latest bookkeeping information for transform, please wait...')

    # check we have an input DS per BK Query
    while len(self.queries) > len(self.inputdata):
        self.inputdata.append(LHCbDataset())

    # loop over the queries and fill the file lists
    for id, query in enumerate(self.queries):

        # Get the latest dataset
        latest_dataset = query.getDataset()

        # Compare to previous inputdata, get new and removed
        logger.info('Checking for new and removed data for query %d, please wait...' %
                    self.queries.index(query))
        dead_data = LHCbDataset()
        new_data = LHCbDataset()

        # loop over the old data and compare
        new_data.files += latest_dataset.difference(self.inputdata[id]).files
        dead_data.files += self.inputdata[id].difference(latest_dataset).files

        # for dead data, find then kill/remove any associated jobs
        # loop over units and check any associated with this DS
        # TODO: Follow through chained tasks
        for unit in self.units:

            # associated unit
            if unit.input_datset_index != id:
                continue

            # find the job
            if len(unit.active_job_ids) == 0:
                continue

            # check the data
            for f in dead_data.files:
                if f in unit.inputdata.files:

                    # kill the job
                    job = getJobByID(unit.active_job_ids[0])
                    if job.status in ['submitted', 'running']:
                        job.kill()

                    # forget the job
                    unit.prev_job_ids.append(unit.active_job_ids[0])
                    unit.active_job_ids = []
                    break

        # in any case, now just set the DS files to the new set
        self.inputdata[id].files = []
        self.inputdata[id].files = latest_dataset.files
def update(self):
    """Update the unit and (re)submit jobs as required"""

    # if we're complete, then just return
    if self.status in ["completed", "recreating"] or not self.active:
        return 0

    # check if submission is needed
    task = self._getParent()._getParent()
    trf = self._getParent()
    maxsub = task.n_tosub()

    # check parent unit(s)
    req_ok = self.checkParentUnitsAreComplete()

    # set the start time if not already set
    if len(self.req_units) > 0 and req_ok and self.start_time == 0:
        self.start_time = time.time() + trf.chain_delay * 60 - 1

    if req_ok and self.checkForSubmission() and maxsub > 0:

        # create job and submit
        addInfoString(self, "Creating Job...")
        j = self.createNewJob()
        if j.name == '':
            j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

        try:
            if trf.submit_with_threads:
                addInfoString(self, "Attempting job submission with queues...")
                from Ganga.Core.GangaThread.WorkerThreads import getQueues
                getQueues().add(j.submit)
            else:
                addInfoString(self, "Attempting job submission...")
                j.submit()

        except Exception as err:
            logger.debug("update Err: %s" % str(err))
            addInfoString(self, "Failed Job Submission")
            addInfoString(self, "Reason: %s" % (formatTraceback()))
            logger.error("Couldn't submit the job. Deactivating unit.")
            self.prev_job_ids.append(j.id)
            self.active = False
            trf._setDirty()  # ensure everything's saved
            return 1

        self.active_job_ids.append(j.id)
        self.updateStatus("running")
        trf._setDirty()  # ensure everything's saved

        if trf.submit_with_threads:
            return 0

        return 1

    # update any active jobs
    for jid in self.active_job_ids:

        # we have an active job so see if this job is OK and resubmit if
        # not
        try:
            job = getJobByID(jid)
        except Exception as err:
            logger.debug("Update2 Err: %s" % str(err))
            logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                           (jid, task.id, trf.getID(), self.getID()))
            continue

        if job.status == "completed":

            # check if actually completed
            if not self.checkCompleted(job):
                return 0

            # check for DS copy
            if trf.unit_copy_output:
                if not self.copy_output:
                    trf.createUnitCopyOutputDS(self.getID())

                if not self.copyOutput():
                    return 0

            # check for merger
            if trf.unit_merger:
                if not self.merger:
                    self.merger = trf.createUnitMerger(self.getID())

                if not self.merge():
                    return 0

            # all good so mark unit as completed
            self.updateStatus("completed")

        elif job.status == "failed" or job.status == "killed":

            # check for too many resubs
            if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                logger.error("Too many resubmits (%i). Deactivating unit." %
                             (self.minor_resub_count + self.major_resub_count))
                addInfoString(self, "Deactivating unit. Too many resubmits (%i)" %
                              (self.minor_resub_count + self.major_resub_count))
                self.active = False
                return 0

            rebroker = False

            if self.minor_resub_count > trf.minor_run_limit - 1:
                if self._getParent().rebroker_on_job_fail:
                    rebroker = True
                else:
                    logger.error("Too many minor resubmits (%i). Deactivating unit." %
                                 self.minor_resub_count)
                    addInfoString(self, "Deactivating unit. Too many resubmits (%i)" %
                                  (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

            if self.major_resub_count > trf.major_run_limit - 1:
                logger.error("Too many major resubmits (%i). Deactivating unit." %
                             self.major_resub_count)
                addInfoString(self, "Deactivating unit. Too many resubmits (%i)" %
                              (self.minor_resub_count + self.major_resub_count))
                self.active = False
                return 0

            # check the type of resubmit
            if rebroker or self.checkMajorResubmit(job):

                self.major_resub_count += 1
                self.minor_resub_count = 0

                try:
                    addInfoString(self, "Attempting major resubmit...")
                    self.majorResubmit(job)
                except Exception as err:
                    logger.debug("Update Err3: %s" % str(err))
                    logger.error("Couldn't resubmit the job. Deactivating unit.")
                    addInfoString(self, "Failed Job resubmission")
                    addInfoString(self, "Reason: %s" % (formatTraceback()))
                    self.active = False

                # break the loop now because we've probably changed the
                # active jobs list
                return 1
            else:
                self.minor_resub_count += 1
                try:
                    addInfoString(self, "Attempting minor resubmit...")
                    self.minorResubmit(job)
                except Exception as err:
                    logger.debug("Update Err4: %s" % str(err))
                    logger.error("Couldn't resubmit the job. Deactivating unit.")
                    addInfoString(self, "Failed Job resubmission")
                    addInfoString(self, "Reason: %s" % (formatTraceback()))
                    self.active = False
                    return 1