def check(self, job): """ Check that a string is in a file, takes the job object as input. """ if not len(self.searchStrings): raise PostProcessException( 'No searchStrings specified, FileCheckeR will do nothing!') filepaths = self.findFiles(job) if not len(filepaths): raise PostProcessException( 'None of the files to check exist, FileCheckeR will do nothing!' ) for filepath in filepaths: for searchString in self.searchStrings: grepoutput = commands.getoutput('grep "%s" %s' % (searchString, filepath)) if len(grepoutput) and self.failIfFound is True: logger.info( 'The string %s has been found in file %s, FileCheckeR will fail job(%s)', searchString, filepath, job.fqid) return self.failure if not len(grepoutput) and self.failIfFound is False: logger.info( 'The string %s has not been found in file %s, FileCheckeR will fail job(%s)', searchString, filepath, job.fqid) return self.failure return self.result
def check(self, job): """ Check that a string is in a file, takes the job object as input. """ if not len(self.searchStrings): raise PostProcessException( 'No searchStrings specified, FileChecker will do nothing!') filepaths = self.findFiles(job) if not len(filepaths): raise PostProcessException( 'None of the files to check exist, FileChecker will do nothing!' ) for filepath in filepaths: for searchString in self.searchStrings: stringFound = False # self.findFiles() guarantees that file at filepath exists, # hence no exception handling with open(filepath) as file: for line in file: if re.search(searchString, line): if self.failIfFound is True: logger.info( 'The string %s has been found in file %s, FileChecker will fail job(%s)', searchString, filepath, job.fqid) return self.failure stringFound = True if not stringFound and self.failIfFound is False: logger.info( 'The string %s has not been found in file %s, FileChecker will fail job(%s)', searchString, filepath, job.fqid) return self.failure return self.result
def mergefiles(self, file_list, output_file):
    import os
    if isinstance(self.module, IGangaFile):
        module_name = os.path.join(self.module.localDir, self.module.namePattern)
    elif isinstance(self.module, File):
        module_name = self.module.name
    else:
        module_name = self.module
    if not os.path.exists(module_name):
        raise PostProcessException(
            "The module '%s' does not exist and so merging will fail." % module_name)
    result = False
    try:
        ns = {'file_list': copy.copy(file_list),
              'output_file': copy.copy(output_file)}
        execfile(module_name, ns)
        exec('_result = mergefiles(file_list, output_file)', ns)
        result = ns.get('_result', result)
    except Exception as e:
        raise PostProcessException(
            'There was a problem executing the custom merge: %s. Merge will fail.' % e)
    if result is not True:
        raise PostProcessException(
            'The custom merge did not return True, merge will fail.')
    return self.success
def check(self, job):
    if (self.module is None) or not self.module:
        raise PostProcessException(
            "No module is specified and so the check will fail.")
    if (self.module.name is None) or not os.path.isfile(self.module.name):
        raise PostProcessException(
            "The module '%s' does not exist and so CustomChecker will do nothing!"
            % self.module.name)
    result = None
    try:
        ns = {'job': job}
        execfile(self.module.name, ns)
        exec('_result = check(job)', ns)
        result = ns.get('_result', result)
    except Exception as e:
        raise PostProcessException(
            'There was a problem with executing the module: %s, CustomChecker will do nothing!' % e)
    if result is not True and result is not False:
        raise PostProcessException(
            'The custom check module did not return True or False, CustomChecker will do nothing!')
    if result is not True:
        logger.info('The custom check module returned False for job(%s)', job.fqid)
        return self.failure
    return self.success
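# Hedged sketch of the contract CustomChecker expects (path and body are
# hypothetical): the user module consumed via execfile() must define a
# top-level check(job) that returns True or False, e.g. /path/to/mycheck.py:
#
#   import os
#   def check(job):
#       # pass only if the job produced a non-empty stdout
#       return os.path.getsize(os.path.join(job.outputdir, 'stdout')) > 0
#
# which is then attached with:
#
#   cc = CustomChecker(module='/path/to/mycheck.py')
#   j.postprocessors.append(cc)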
def mergefiles(self, file_list, output_file):
    # if no opts file is specified, then use version from installation

    # this is the bit specifying the files
    output_opts = """
outputfile = '%s'
""" % output_file
    output_opts += "\ninput = ["
    file_sep = ','
    for f in file_list:
        if f is file_list[-1]:
            file_sep = ''
        output_opts += "'%s' %s " % (f, file_sep)
    output_opts += "]"
    output_opts += """
from GaudiConf import IOHelper
IOHelper().inputFiles(input)
IOHelper().outStream(outputfile, "InputCopyStream")
from Configurables import LHCbApp
LHCbApp().EvtMax = -1
"""
    # write this out to a temporary options file
    opts_file_name = tempfile.mktemp('.py')
    opts_file = open(opts_file_name, 'w')
    try:
        opts_file.write(output_opts)
    finally:
        opts_file.close()
    if not os.path.exists(opts_file_name):
        msg = "Failed to write temporary options file '%s' during merge"
        raise PostProcessException(msg % opts_file_name)

    import EnvironFunctions
    script_file_name = EnvironFunctions.construct_merge_script(self.version,
                                                               opts_file_name)
    return_code = subprocess.call(['/bin/sh', script_file_name])
    if return_code != 0:
        msg = 'The LHCbFileMerger returned %i when calling gaudirun'
        logger.warning(msg % return_code)

    # finally clean up
    os.unlink(script_file_name)
    os.unlink(opts_file_name)
    if not os.path.exists(output_file):
        msg = "The output file '%s' was not created"
        raise PostProcessException(msg % output_file)
def email(self, job, newstatus):
    """
    Method to email a user about a job
    """
    sender = '*****@*****.**'
    receivers = self.address
    subject = 'Ganga Notification: Job(%s) has %s.' % (job.fqid, newstatus)
    msg_string = """
Dear User,

Job(%s) has gone into %s state.

Regards,
Ganga

PS: This is an automated notification from Ganga,
if you would like these messages to stop please
remove the notifier object from future jobs.
""" % (job.fqid, newstatus)
    msg = email.message_from_string(msg_string)
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = receivers
    string_message = msg.as_string()
    try:
        smtpObj = smtplib.SMTP(config['SMTPHost'])
        smtpObj.sendmail(sender, receivers, string_message)
    except smtplib.SMTPException as e:
        raise PostProcessException(str(e))
    return True
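# Usage sketch (hedged, GPI): attach a notifier so state changes are mailed to
# the address held in self.address; the SMTP server comes from
# config['SMTPHost'] as read above. The address below is a placeholder.
#
#   n = Notifier(address='user@example.com')
#   j.postprocessors.append(n)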
def notify(self, job, newstatus):
    address = self.address
    key = self.key
    chat_id = self.chat_id
    msg = 'Ganga update:\n{}\n{}'.format(job.fqid, newstatus)
    url = '{}/relay{}/sendMessage'.format(address, key)
    payload = {
        'text': msg,
        'chat_id': chat_id,
    }
    try:
        response = requests.post(url, json.dumps(payload))
    except RequestException as e:
        raise PostProcessException(str(e))
    else:
        if response.status_code != 200:
            raise PostProcessException(
                'The notification relay returned status code %s' % response.status_code)
    return True
def mergefiles(self, file_list, output_file):
    logger.info('merging')
    if self.wait > 0:
        logger.info('sleeping for %d seconds' % self.wait)
        import time
        time.sleep(self.wait)
    if self.alwaysfail:
        raise PostProcessException(
            'This merge will always fail as this is a test')
def mergefiles(self, file_list, output_file):
    from Ganga.Utility.root import getrootprefix, checkrootprefix
    rc, rootprefix = getrootprefix()
    if rc != 0:
        raise PostProcessException(
            'ROOT has not been properly configured. Check your .gangarc file.')
    if checkrootprefix():
        raise PostProcessException(
            'Can not run ROOT correctly. Check your .gangarc file.')

    # we always force as the overwrite is handled by our parent
    default_arguments = '-f'
    merge_cmd = rootprefix + 'hadd '
    if self.args:  # pass any args on
        merge_cmd += ' %s ' % self.args

    # don't add a -f unless needed
    if default_arguments not in merge_cmd:
        merge_cmd += ' %s ' % default_arguments

    # add the list of files, output file first
    arg_list = [output_file]
    arg_list.extend(file_list)
    merge_cmd += ' '.join(arg_list)

    rc, out = commands.getstatusoutput(merge_cmd)

    log_file = '%s.hadd_output' % output_file
    with open(log_file, 'w') as log:
        log.write('# -- Hadd output -- #\n')
        log.write('%s\n' % out)

    if rc:
        logger.error(out)
        raise PostProcessException(
            'The ROOT merge failed to complete. The command used was %s.' % merge_cmd)
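# Illustrative note (assuming rootprefix='' and self.args='-v 1'): the command
# assembled above takes the form
#
#   hadd  -v 1  -f merged.root sub0.root sub1.root
#
# i.e. the output file first, then the inputs, with -f appended only when the
# user arguments do not already force overwriting. A hedged GPI usage sketch:
#
#   rm = RootMerger(files=['hist.root'], args='-v 1')
#   j.postprocessors.append(rm)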
def check(self, job): """ Checks metadata of job is within a certain range. """ if self.expression == None: raise PostProcessException( 'No expression is set. MetaDataChecker will do nothing!') try: self.result = self.calculateResult(job) except Exception as e: raise PostProcessException( 'There was an error parsing the checker expression: %s - MetaDataChecker will do nothing!' % e) if self.result is not True and self.result is not False: raise PostProcessException( 'The expression "%s" did not evaluate to True or False, MetaDataChecker will do nothing!' % self.expression) if self.result is False: logger.info( 'MetaDataChecker has failed job(%s) because the expression "%s" is False' % (job.fqid, self.expression)) return self.result
def calculateResult(self, j):
    """
    """
    inputevents = None
    outputevents = None
    lumi = None
    nskipped = None
    nfiles = None
    if self.expression.find('inputevents') > -1:
        try:
            inputevents = j.metadata['events']['input']
        except Exception as err:
            logger.error("%s" % str(err))
            raise PostProcessException(
                "The metadata value j.events['input'] was not defined")
def findFiles(self, job):
    if not len(self.files):
        raise PostProcessException(
            'No files specified, %s will do nothing!' % getName(self))
    filepaths = []
    for f in self.files:
        filepath = os.path.join(job.outputdir, f)
        matches = glob.glob(filepath)
        for expanded_file in matches:
            filepaths.append(expanded_file)
        if not len(matches):
            if self.filesMustExist:
                logger.info(
                    'The file %s does not exist, %s will fail job(%s) (to ignore missing files set filesMustExist to False)',
                    filepath, self._name, job.fqid)
                self.result = False
            else:
                logger.warning('Ignoring file %s as it does not exist.', filepath)
    return filepaths
def check(self, job):
    IsMC = 0
    IsGenie = 0
    IsNeut = 0

    if self.trig == 'SPILL':
        self.TRIGTYPE = 'spill'
    elif self.trig == 'COSMIC':
        self.TRIGTYPE = 'cosmic'
    elif self.trig == 'MCP' or self.trig == 'MCSAND':
        self.TRIGTYPE = 'spill'
        IsMC = 1
    elif self.trig == 'MCCOS':
        self.TRIGTYPE = 'all'
        IsMC = 1
    else:
        raise PostProcessException("Unknown type of data: " + self.trig)

    if not self.site:
        raise PostProcessException('Site is not given')

    # finds .log file
    self.files = ['*.log']
    filepaths = self.findFiles(job)
    if len(filepaths) != 1:
        logger.error('Something wrong with logfile(s) ' + str(filepaths) + '. CANNOT CONTINUE')
        self.move_output(job, ok=False)
        return False
    self.logf = filepaths[0]

    filename = os.path.basename(self.logf)
    chunks = filename.split('_')
    chunks = chunks[3].split('-')
    self.RUN = chunks[0]
    self.SUBRUN = chunks[1]

    if not os.path.exists(self.logf):
        logger.error("Log file " + filename + " not found. Exit on error")
        self.move_output(job, ok=False)
        return False

    # Check generator if this is a beam MC
    if IsMC == 1:
        if filename.find('oa_nt_') >= 0:
            logger.info("This is a NEUT MC log file")
            IsNeut = 1
        elif filename.find('oa_gn_') >= 0:
            logger.info("This is a GENIE MC log file")
            IsGenie = 1

    if IsMC == 1:
        self.range = 0  # no range added to paths

    logger.info("Starting to scan file " + filename)
    logger.info("for run %s, subrun %s, type %s" % (self.RUN, self.SUBRUN, self.trig))

    inlogf = open(self.logf)
    for self.line in inlogf:
        self.line = self.line.strip('\n')

        if self.find('Midas File') and self.find('has been truncated'):
            logger.error('%s\n%s', self.line, "Midas file probably missing")
            self.ReturnCode = -1
            self.STAGE = "cali"
            self.send_status()
            self.InStage = 0
            break
        elif self.find('Starting job for neutMC.'):
            # neutMC logs are filled by Fluka, they are huge and seem useless
            logger.info(self.line + " The rest of log is ignored.")
            self.ReturnCode = 1
            self.STAGE = "neutMC"
            self.send_status()
            self.InStage = 0
            break
        elif self.find('Starting job for nd280MC'):
            self.InStage = 1
            self.STAGE = "nd280MC"
            logger.info(self.line)
        elif self.find('Starting job for elecSim'):
            self.InStage = 1
            self.STAGE = "elecSim"
            logger.info(self.line)
        elif self.find('Starting job for oaCosmicTrigger'):
            self.InStage = 1
            self.STAGE = "COSMICTRIG"
            logger.info(self.line)
        elif self.find('Starting job for oaCalib'):
            self.InStage = 1
            self.STAGE = "cali"
            logger.info(self.line)
        elif self.find('Starting job for oaRecon'):
            self.InStage = 1
            self.STAGE = "reco"
            logger.info(self.line)
        elif self.find('Starting job for oaAnalysis'):
            self.InStage = 1
            self.STAGE = "anal"
            logger.info(self.line)
        elif self.find('Starting job for '):
            logger.info(self.line)
        elif self.find('Found Command event_select '):
            logger.info(self.line)
            chunks = self.line.split()
            self.TRIGTYPE = chunks[5]
        elif self.InStage == 1:
            if self.find('Segmentation fault'):
                logger.error(self.line)
                if self.line == '"oaCherryPicker-geo_v5mr.bat: line 7"':
                    logger.warning("This is an acceptable error - ignore it")
                else:
                    self.ReturnCode = -2
                    self.InStage = 0
                    self.send_status()
                    break
            elif self.find('Disk quota exceeded'):
                logger.error(self.line)
                self.ReturnCode = -3
                self.InStage = 0
                self.send_status()
                break
            elif self.find(' ERROR: No database for spillnum'):
                logger.error(self.line)
                self.ReturnCode = -8
                self.InStage = 0
                self.send_status()
                break
            elif self.find(' No BSD data available'):
                logger.error(self.line)
                self.ReturnCode = -8
                self.InStage = 0
                self.send_status()
                break
            elif self.find('Disabling module '):
                logger.info("IsMC = " + str(IsMC))
                if IsMC == 1:
                    logger.info("After testing IsMC = " + str(IsMC))
                    if self.find('Disabling module GRooTrackerVtx') and IsGenie:
                        logger.error("Atest " + self.line)
                        self.ReturnCode = -4
                        self.InStage = 0
                        self.send_status()
                        break
                    elif self.find('Disabling module GRooTrackerVtx') and IsNeut:
                        logger.error("Btest " + self.line)
                        self.ReturnCode = -4
                        self.InStage = 0
                        self.send_status()
                        break
                    elif not self.find('RooTracker'):
                        logger.error("Ctest " + self.line)
                        self.ReturnCode = -4
                        self.InStage = 0
                        self.send_status()
                        break
            elif self.find('probably not closed, trying to recover'):
                logger.error(self.line)
                self.ReturnCode = -6
                self.InStage = 0
                self.send_status()
                break
            elif self.find('St9bad_alloc'):
                logger.error(self.line)
                self.ReturnCode = -7
                self.InStage = 0
                self.send_status()
                break
            elif self.find('No luck connecting to GSC MySQL server'):
                logger.error(self.line)
                self.ReturnCode = -9
                self.InStage = 0
                self.send_status()
                break
            elif self.find('EProductionException'):
                logger.error(self.line)
                self.ReturnCode = -11
                self.InStage = 0
                self.send_status()
                break
            elif self.find('Total Events Read'):
                chunks = self.line.split()
                self.EventsIn = chunks[3]
            elif self.find('Total Events Written'):
                chunks = self.line.split()
                self.EventsOut = chunks[3]
            elif self.find('Number of events ='):
                chunks = self.line.split()
                if self.STAGE == 'nd280MC':
                    self.EventsOut = chunks[5]
            elif self.find('Total number of events processed in Analysis'):
                chunks = self.line.split()
                self.EventsOut = chunks[9]
            elif self.find('Job Completed Successfully'):
                logger.info(self.line)
                nextline = inlogf.next()
                if nextline.find('Run time') >= 0:
                    chunks = nextline.split()
                    Time = chunks[6]
                    chunks = Time.split('.')
                    self.Time = chunks[0]
                self.ReturnCode = 1
                if self.STAGE == 'COSMICTRIG':
                    logger.warning("Not sure what this stage is yet - no call to post_status")
                else:
                    self.send_status()
                self.reset_variables()
                self.InStage = 0

    if self.InStage == 1:
        logger.error("The stage " + self.STAGE + " has not completed successfully, Error is unknown")
        self.ReturnCode = 0
        self.send_status()

    inlogf.close()

    logger.info("Finished scanning the log file. Last check return code posted is " + str(self.ReturnCode))

    self.move_output(job)
    return self.ReturnCode == 1
def check(self, job): """ Check that ROOT files are not zombies and were closed properly, also (for master job only) checks that the merging performed correctly. """ import ROOT self.result = True filepaths = self.findFiles(job) if self.result is False: return self.failure if not len(filepaths): raise PostProcessException( 'None of the files to check exist, RootFileChecker will do nothing!' ) for f in filepaths: if f.find('.root') < 0: raise PostProcessException( 'The file "%s" is not a ROOT file, RootFileChecker will do nothing!' % os.path.basename(f)) if not self.checkMergeable(f): return self.failure if (len(job.subjobs) and self.checkMerge): haddoutput = f + '.hadd_output' if not os.path.exists(haddoutput): logger.warning( 'Hadd output file %s does not exist, cannot perform check on merging.', haddoutput) return self.success for failString in [ 'Could not find branch', 'One of the export branches', 'Skipped file' ]: grepoutput = commands.getoutput('grep "%s" %s' % (failString, haddoutput)) if len(grepoutput): logger.info( 'There was a problem with hadd, the string "%s" was found. Will fail job', failString) return self.failure tf = ROOT.TFile.Open(f) mastertrees = GetTreeObjects(tf) entries_dict = {} for sj in job.subjobs: if (sj.status == 'completed'): for subfile in self.findFiles(sj): if (os.path.basename(subfile) == os.path.basename( f)): subtf = ROOT.TFile.Open(subfile) subtrees = GetTreeObjects(subtf) substructure = sorted(subtrees.keys()) masterstructure = sorted(mastertrees.keys()) if (substructure != masterstructure): logger.info( 'File structure of subjob %s is not the same as master job, failing job', sj.fqid) return self.failure if not self.checkBranches( mastertrees, subtrees): logger.info( 'The tree structure of subjob %s is not the same as merged tree, failing job', sj.fqid) return self.failure entries_dict = self.addEntries( mastertrees, subtrees, entries_dict) subtf.Close() master_entries_dict = dict( (n, mastertrees[n].GetEntries()) for n in set(mastertrees)) if (SortedValues(entries_dict) != SortedValues(master_entries_dict)): logger.info( 'Sum of subjob tree entries is not the same as merged tree entries for file %s, failing job (check hadd output)', os.path.basename(f)) return self.failure tf.Close() return self.result
class LHCbMetaDataChecker(MetaDataChecker):

    """
    Checks the meta data of a job is within some range,
    currently accepts 'lumi', 'inputevents', 'outputevents', 'nskipped' and 'nfiles'.

    For example do:

        mc = LHCbMetaDataChecker()
        mc.expression = 'nskipped == 0'
        j.postprocessors.append(mc)

    to fail jobs which skip some input files.
    """
    _schema = MetaDataChecker._schema.inherit_copy()
    _category = 'postprocessor'
    _name = 'LHCbMetaDataChecker'
    _exportmethods = ['check']

    def calculateResult(self, j):
        """
        """
        inputevents = None
        outputevents = None
        lumi = None
        nskipped = None
        nfiles = None
        if self.expression.find('inputevents') > -1:
            try:
                inputevents = j.metadata['events']['input']
            except Exception as err:
                logger.error("%s" % str(err))
                raise PostProcessException(
                    "The metadata value j.events['input'] was not defined")
        if self.expression.find('outputevents') > -1:
            try:
                outputevents = j.metadata['events']['output']
            except Exception:
                raise PostProcessException(
                    "The metadata value j.events['output'] was not defined")
        if self.expression.find('lumi') > -1:
            try:
                lumi = float(j.metadata['lumi'][1:j.metadata['lumi'].find(' ')])
            except Exception:
                raise PostProcessException(
                    "The metadata value j.lumi was not defined")
        if self.expression.find('nskipped') > -1:
            try:
                nskipped = len(j.metadata['xmlskippedfiles'])
            except Exception:
                raise PostProcessException(
                    "The metadata value j.xmlskippedfiles was not defined")
        if self.expression.find('nfiles') > -1:
            try:
                nfiles = float(j.metadata['xmldatanumbers']['full'])
            except Exception:
                raise PostProcessException(
                    "The metadata value j.xmldatanumbers was not defined")
        return eval(self.expression)
def merge(self, jobs, outputdir=None, ignorefailed=None, overwrite=None):

    if ignorefailed is None:
        ignorefailed = self.ignorefailed

    if overwrite is None:
        overwrite = self.overwrite

    from Ganga.GPIDev.Lib.Job import Job

    if not outputdir:
        outputdir = getDefaultMergeDir()
    else:
        if isType(outputdir, Job):
            # use info from job
            outputdir = outputdir.outputdir
        else:
            outputdir = os.path.expanduser(outputdir)

    files = {}
    if isType(jobs, Job):
        if outputdir is None:
            outputdir = jobs.outputdir
        return self.merge(jobs.subjobs, outputdir=outputdir,
                          ignorefailed=ignorefailed, overwrite=overwrite)

    if not len(jobs):
        logger.warning(
            'The jobslice given was empty. The merge will not continue.')
        return self.success

    for j in jobs:
        # first check that the job is ok
        if j.status != 'completed':
            # check if we can keep going
            if j.status == 'failed' or j.status == 'killed':
                if ignorefailed:
                    logger.warning(
                        'Job %s has status %s and is being ignored.', j.fqid, j.status)
                    continue
                else:
                    raise PostProcessException(
                        'Job %s has status %s and so the merge can not continue. '
                        'This can be overridden with the ignorefailed flag.'
                        % (j.fqid, j.status))
            else:
                raise PostProcessException(
                    "Job %s is in an unsupported status %s and so the merge can not continue. "
                    "Supported statuses are 'completed', 'failed' or 'killed' (if the ignorefailed flag is set)."
                    % (j.fqid, j.status))

        if len(j.subjobs):
            sub_result = self.merge(j.subjobs, outputdir=j.outputdir,
                                    ignorefailed=ignorefailed, overwrite=overwrite)
            if (sub_result == self.failure) and not ignorefailed:
                raise PostProcessException(
                    'The merge of Job %s failed and so the merge can not continue. '
                    'This can be overridden with the ignorefailed flag.' % j.fqid)

        import glob
        for f in self.files:
            for matchedFile in glob.glob(os.path.join(j.outputdir, f)):
                relMatchedFile = ''
                try:
                    relMatchedFile = os.path.relpath(matchedFile, j.outputdir)
                except Exception as err:
                    logger.debug("Err: %s" % str(err))
                    Ganga.Utility.logging.log_unknown_exception()
                    # fall back to the local relpath helper
                    relMatchedFile = relpath(matchedFile, j.outputdir)
                if relMatchedFile in files:
                    files[relMatchedFile].append(matchedFile)
                else:
                    files[relMatchedFile] = [matchedFile]

            if not len(glob.glob(os.path.join(j.outputdir, f))):
                if ignorefailed:
                    logger.warning(
                        'The file pattern %s in Job %s was not found. The file will be ignored.',
                        str(f), j.fqid)
                    continue
                else:
                    raise PostProcessException(
                        'The file pattern %s in Job %s was not found and so the merge can not continue. '
                        'This can be overridden with the ignorefailed flag.' % (str(f), j.fqid))

    for k in files.keys():
        # make sure we are not going to overwrite anything
        outputfile = os.path.join(outputdir, k)
        if os.path.exists(outputfile) and not overwrite:
            raise PostProcessException(
                'The merge process can not continue as it will result in overwriting. '
                'Either move the file %s or set the overwrite flag to True.' % str(outputfile))

        # make the directory if it does not exist
        if not os.path.exists(outputdir):
            os.makedirs(outputdir)

        # recreate structure from output sandbox
        outputfile_dirname = os.path.dirname(outputfile)
        if outputfile_dirname != outputdir:
            if not os.path.exists(outputfile_dirname):
                os.mkdir(outputfile_dirname)

        # check that we are merging some files
        if not files[k]:
            logger.warning(
                'Attempting to merge with no files. Request will be ignored.')
            continue

        # check outputfile != inputfile
        for f in files[k]:
            if f == outputfile:
                raise PostProcessException(
                    'Output file %s equals input file %s. The merge will fail.'
                    % (outputfile, f))

        # merge the lists of files with a merge tool into outputfile
        msg = None
        try:
            self.mergefiles(files[k], outputfile)

            # create a log file of the merge
            # we only get to here if the merge_tool ran ok
            log_file = '%s.merge_summary' % outputfile
            with open(log_file, 'w') as log:
                log.write('# -- List of files merged -- #\n')
                for f in files[k]:
                    log.write('%s\n' % f)
                log.write('# -- End of list -- #\n')
        except PostProcessException as e:
            msg = str(e)

            # store the error msg
            log_file = '%s.merge_summary' % outputfile
            with open(log_file, 'w') as log:
                log.write('# -- Error in Merge -- #\n')
                log.write('\t%s\n' % msg)
            raise e

    return self.success
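# Hedged GPI usage sketch (assuming the exported SmartMerger, which exposes
# this merge() signature): merge stdout from every subjob of an existing job
# into a chosen directory, overwriting any previous results.
#
#   sm = SmartMerger(files=['stdout'], overwrite=True, ignorefailed=True)
#   sm.merge(jobs(42), outputdir='~/merged_output')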