def loadJobReport(self):
    """
    _loadJobReport_

    Extract the JobReport from the job report file if it exists

    """
    if not os.path.exists(self.jobReport):
        return
    jobReport = readJobReport(self.jobReport)[0]
    self._JobReport = jobReport
    self.jobReportLoaded = True

    # Convert PFNs to absolute paths if they exist in this directory
    for fileInfo in self._JobReport.files:
        pfn = fileInfo['PFN']
        if pfn.startswith("file:"):
            pfn = pfn.replace("file:", "")
        pfnPath = os.path.join(self.dir, pfn)
        if not os.path.exists(pfnPath):
            continue
        fileInfo['PFN'] = pfnPath
    return
def extract_info(report_filename):
    exit_code = 0
    skipped = []
    infos = {}
    written = 0
    with open(report_filename) as f:
        for report in readJobReport(f):
            for error in report.errors:
                exit_code = error.get('ExitStatus', exit_code)

            for file in report.skippedFiles:
                skipped.append(file['Lfn'])

            for file in report.files:
                written += int(file['TotalEvents'])

            for file in report.inputFiles:
                filename = file['LFN'] if len(file['LFN']) > 0 else file['PFN']
                file_lumis = []
                try:
                    for run, ls in file['Runs'].items():
                        for lumi in ls:
                            file_lumis.append((run, lumi))
                except AttributeError:
                    print 'Detected file-based job.'
                infos[filename] = (int(file['EventsRead']), file_lumis)

            eventtime = report.performance.summaries['Timing']['TotalEventCPU']
            cputime = report.performance.summaries['Timing']['TotalJobCPU']
    return infos, skipped, written, exit_code, eventtime, cputime
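# Usage sketch for extract_info() above -- an illustration, not part of the
# original source. The report file name is an assumption; readJobReport is
# imported from ProdCommon.FwkJobRep.ReportParser elsewhere in this codebase.
from ProdCommon.FwkJobRep.ReportParser import readJobReport

infos, skipped, written, exit_code, eventtime, cputime = \
    extract_info('FrameworkJobReport.xml')
print 'exit code:', exit_code, '- events written:', written
for lfn, (events_read, lumis) in infos.items():
    print lfn, ': read', events_read, 'events over', len(lumis), 'lumi sections'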
def updateReport(reportFile, newReportInstance):
    """
    _updateReport_

    Given a file containing several reports: reportFile,
    find the report in there whose name matches the
    newReportInstance's name and replace that report with
    the new Report instance.

    Returns a boolean: True if the report name was matched and updated,
    False if the report was not found and therefore not updated.
    (False may indicate that the new report file needs to be merged
    with the main report file)

    """
    if not os.path.exists(reportFile):
        existingReports = []
    else:
        existingReports = readJobReport(reportFile)

    updatedReport = False
    output = IMProvDoc("JobReports")
    for report in existingReports:
        if report.name == newReportInstance.name:
            output.addNode(newReportInstance.save())
            updatedReport = True
        else:
            output.addNode(report.save())

    handle = open(reportFile, 'w')
    handle.write(output.makeDOMDocument().toprettyxml())
    handle.close()
    return updatedReport
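# Usage sketch for updateReport() -- illustrative only. It assumes the
# FwkJobReport class from ProdCommon.FwkJobRep.FwkJobReport (constructed
# here with just a name); the file name and the report name "cmsRun1" are
# hypothetical.
from ProdCommon.FwkJobRep.FwkJobReport import FwkJobReport

newReport = FwkJobReport("cmsRun1")
newReport.status = "Success"
if not updateReport("JobReports.xml", newReport):
    # the name was not matched, so nothing was replaced; the new report
    # still needs to be merged into the aggregate file
    print "report cmsRun1 not found; merge it instead"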
def __call__(self):
    """
    _operator()_

    Callable function to handle the job report

    """
    logging.info('read job report.......')

    # remove file:// from file name (if any)
    jobReport = self.reportFile.replace('file://', '')

    # verify the file exists
    if not os.path.exists(jobReport):
        logging.error("Cannot process JobSuccess event: " \
                      + "job report %s does not exist." % jobReport)
        return None

    # read the report
    try:
        self.extend(readJobReport(jobReport))

    # check errors
    except Exception, msg:
        logging.error("Cannot process JobSuccess event for %s: %s" \
                      % (jobReport, msg))
        return None
def parseFinalReport(self, reportfilename, job):
    """
    __parseFinalReport__

    Parses the FWJR produced by the job in order to retrieve
    the WrapperExitCode and the ExeExitCode.
    Updates the BossDB with these values.

    """
    # defaults
    success = False

    # skip empty files
    if os.path.getsize(reportfilename) == 0:
        raise InvalidReport, "FrameworkJobReport is empty."

    # check for success (needed for chain jobs)
    success = checkSuccess(reportfilename)

    # read standard info
    # FIXME: avoid reading the same file twice!
    try:
        reports = readJobReport(reportfilename)
    except Exception, err:
        logging.error('Invalid Framework Job Report : %s' % str(err))
        raise InvalidReport, "Invalid FrameworkJobReport."
def checkSuccess(jobReportFile):
    """
    _checkSuccess_

    Read a FrameworkJobReport XML file and verify that all
    reports evaluate as successful.

    If a report is not successful, False is returned (JobFailed)
    If all reports in the file are successful, True is returned (JobSuccess)

    """
    try:
        reports = readJobReport(jobReportFile)
    except:
        # exception indicates bad report file => implies failure
        return False
    if len(reports) == 0:
        return False

    for report in reports:
        if report.wasSuccess():
            continue
        else:
            return False
    return True
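# Usage sketch for checkSuccess() -- illustrative only; the report path is
# an assumption. A missing, empty or unparseable report simply yields False,
# so this is safe to call on whatever the job left behind.
if checkSuccess("FrameworkJobReport.xml"):
    print "job succeeded"
else:
    print "job failed or produced an unreadable report"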
def handleError(self, payload):
    jobReport = readJobReport(payload)
    if len(jobReport) == 0:
        logging.error("Error parsing FWJR: %s" % payload)
        return
    jobId = jobReport[0].jobSpecId
    logging.debug(">MergeRunFailureHandler<: do nothing for the moment")
def checkValidFJR(self):
    """
    Check that the report file parses into at least one job report.
    Returns 1 if valid, 0 otherwise.
    """
    valid = 0
    fjr = readJobReport(self.reportFileName)
    if len(fjr) > 0:
        valid = 1
    return valid
def fillFJR(self):
    """
    Fill the FJR with exit codes or timing information, depending on the
    directive; fall back to writing a fresh FJR if the existing one is
    not valid.
    """
    valid = self.checkValidFJR()
    if valid == 1 and self.directive == "--errorcode":
        jobReport = readJobReport(self.reportFileName)[0]
        if len(jobReport.errors) > 0:
            error = 0
            for err in jobReport.errors:
                if err["Type"] == "WrapperExitCode":
                    err["ExitStatus"] = self.wrapperExitCode
                    jobReport.write(self.reportFileName)
                    error = 1
                if self.exeExitCode != "":
                    if err["Type"] == "ExeExitCode":
                        err["ExitStatus"] = self.exeExitCode
                        jobReport.write(self.reportFileName)
                        error = 1
            if error == 0:
                jobReport.addError(self.wrapperExitCode, "WrapperExitCode")
                if self.exeExitCode != "":
                    jobReport.addError(self.exeExitCode, "ExeExitCode")
                jobReport.write(self.reportFileName)
        else:
            jobReport.addError(self.wrapperExitCode, "WrapperExitCode")
            if self.exeExitCode != "":
                jobReport.addError(self.exeExitCode, "ExeExitCode")
            jobReport.write(self.reportFileName)
    elif valid == 1 and self.directive == "--timing":
        jobReport = readJobReport(self.reportFileName)[0]
        # add timing settings here
        perf = jobReport.performance
        perf.addSummary(
            "CrabTiming",
            WrapperTime=self.wrapperTime,
            ExeTime=self.exeTime,
            StageoutTime=self.stageoutTime,
            CpuTime=self.cpuTime,
        )
        jobReport.write(self.reportFileName)
    else:
        self.writeFJR()
def extract_info(config, data, report_filename):
    """Extract job data from a framework report.

    Analyze the CMSSW job framework report to get the CMSSW exit code,
    skipped files, runs and lumis processed on a file basis, total events
    written, and CPU time overall and per event.
    """
    exit_code = 0
    skipped = []
    infos = {}
    written = 0
    eventsPerRun = 0

    with open(report_filename) as f:
        for report in readJobReport(f):
            for error in report.errors:
                code = error.get('ExitStatus', exit_code)
                if exit_code == 0:
                    exit_code = code

            for file in report.skippedFiles:
                filename = file['Lfn']
                filename = config['file map'].get(filename, filename)
                skipped.append(filename)

            for file in report.files:
                written += int(file['TotalEvents'])

            for file in report.inputFiles:
                filename = file['LFN'] if len(file['LFN']) > 0 else file['PFN']
                filename = config['file map'].get(filename, filename)
                file_lumis = []
                try:
                    for run, ls in file['Runs'].items():
                        for lumi in ls:
                            file_lumis.append((run, lumi))
                except AttributeError:
                    print 'Detected file-based job.'
                infos[filename] = (int(file['EventsRead']), file_lumis)
                eventsPerRun += infos[filename][0]

            eventtime = report.performance.summaries['Timing']['TotalEventCPU']
            cputime = report.performance.summaries['Timing']['TotalJobCPU']

    data['files']['info'] = infos
    data['files']['skipped'] = skipped
    data['events written'] = written
    data['cmssw exit code'] = exit_code
    # For efficiency, we care only about the CPU time spent processing
    # events
    data['cpu time'] = eventtime
    data['events per run'] = eventsPerRun

    return cputime
def run(self):
    """
    Parse all xml files in the res dir and create the dictionary
    """
    task = common._db.getTask()
    good_list = []   # list of fjr's to publish
    for job in task.getJobs():
        if (job.runningJob['applicationReturnCode'] != 0 or
            job.runningJob['wrapperReturnCode'] != 0):
            continue
        # get FJR filename
        fjr = self.fjrDirectory + job['outputFiles'][-1]
        reports = readJobReport(fjr)
        if len(reports) > 0:
            if reports[0].status == "Success":
                good_list.append(fjr)

    if len(good_list) == 0:
        common.logger.info("No fjr with exit code =0 to be published")
        status = '0'
        return status

    common.logger.info("Task has %d jobs" % len(task.getJobs()))
    common.logger.info("Found %d fjr's with exit code=0 to be considered for publication" % len(good_list))

    pubToDBS2 = False
    pubToDBS3 = True
    if self.cfg_params.get('CMSSW.publish_dbs2', None) == "1":
        pubToDBS2 = True
        pubToDBS3 = False

    if pubToDBS2:
        status = self.DBS2Publish(good_list)
    elif pubToDBS3:
        argsForDbs3 = self.PrepareForDBS3Publish(good_list)
        globalApi = argsForDbs3['globalApi']
        sourceApi = argsForDbs3['sourceApi']
        inputDataset = argsForDbs3['inputDataset']
        toPublish = argsForDbs3['toPublish']
        destApi = argsForDbs3['destApi']
        destReadApi = argsForDbs3['destReadApi']
        migrateApi = argsForDbs3['migrateApi']
        originSite = argsForDbs3['origin_site_name']
        (failed, published, results) = publishInDBS3(\
            sourceApi, globalApi, inputDataset, toPublish,
            destApi, destReadApi, migrateApi, originSite)
        if len(failed) == 0:
            status = '0'
        else:
            status = '1'
    else:
        raise CrabException('Could not decide whether to publish to DBS2 or DBS3')
    return status
def setStatus(self):
    """
    Set the job report status to Success or Failed based on the wrapper
    and executable exit codes.
    """
    if (self.wrapperExitCode == "0") and (self.exeExitCode == "0"):
        status = "Success"
    else:
        status = "Failed"
    jobReport = readJobReport(self.reportFileName)[0]
    jobReport.status = status
    jobReport.write(self.reportFileName)
    return
def wasJobFailure(self):
    """
    _wasJobFailure_

    Check whether the job is flagged as a failure in the top-level report

    return boolean, True if job was a failure

    """
    status = False
    toplevelReport = os.path.join(os.environ['PRODAGENT_JOB_DIR'],
                                  "FrameworkJobReport.xml")
    toplevelReps = readJobReport(toplevelReport)
    for rep in toplevelReps:
        if not rep.wasSuccess():
            status = True
    return status
def ExtractOuputPerJob(ArchiveFileList, ExtractDir, debug):
    """
    open archives in ExtractDir and extract performance information
    """
    jobs = []
    counter = 0
    for archive in ArchiveFileList:
        # extract the FrameworkJobReport.xml
        if os.path.isfile(archive) and tarfile.is_tarfile(archive):
            counter += 1
            if counter % 100 == 0:
                print 'Archive counter', counter
            jobtarfile = tarfile.open(archive, 'r:gz')
            reportFileList = [tf for tf in jobtarfile.getnames()
                              if tf.count("FrameworkJobReport.xml")]
            if len(reportFileList) > 0:
                reportFile = reportFileList[0]
                jobtarfile.extract(reportFile, ExtractDir)
                reportfilename = os.path.join(ExtractDir, reportFile)
                if os.path.isfile(reportfilename):
                    if debug:
                        print 'Opening FrameworkJobReport:', reportfilename, 'from archive:', archive
                    reports = readJobReport(reportfilename)
                    timePerJob = 0
                    size = 0
                    for report in reports:
                        tmp_size = 0
                        tmp_timePerJob = 0
                        for file in report.files:
                            tmp_size += float(file['Size']) / 1024. / 1024.
                        tmp_timePerJob += float(report.timing['AppEndTime']) - float(report.timing['AppStartTime'])
                        # tmp_timePerJob += float(report.timing['StageOutEnd']) - float(report.timing['StageOutStart'])
                        if debug == 1:
                            print 'report:', tmp_timePerJob, tmp_size
                        timePerJob += tmp_timePerJob
                        size += tmp_size
                    if debug == 1:
                        print 'total:', timePerJob, size
                    jobs.append(size / timePerJob)
            else:
                print 'Archive:', archive, 'does not contain a FrameworkJobReport.xml, skipping!'
        else:
            print 'Archive:', archive, 'is not a gzipped tarball, skipping!'
    return jobs
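# Usage sketch for ExtractOuputPerJob() -- illustrative only; the glob
# pattern and extraction directory are assumptions. Each list entry is an
# output rate in MB per second for one successfully parsed archive.
import glob

archives = glob.glob('/data/logs/Job_*.tar.gz')
rates = ExtractOuputPerJob(archives, '/tmp/fjr-extract', 0)
if rates:
    print 'average output rate [MB/s]:', sum(rates) / len(rates)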
def mergeReports(reportFile1, reportFile2):
    """
    _mergeReports_

    Load job reports from both files, and combine them into a
    single file. The output will be written to the first file provided.
    (i.e. job reports from reportFile2 will be added to reportFile1)

    If reportFile1 does not exist, a new report will be created, containing
    the contents of reportFile2.

    If reportFile2 does not exist, then a RuntimeError is thrown.

    """
    if not os.path.exists(reportFile1):
        reports1 = []
    else:
        reports1 = readJobReport(reportFile1)

    if not os.path.exists(reportFile2):
        msg = "Report file to be merged does not exist:\n"
        msg += reportFile2
        raise RuntimeError, msg

    reports2 = readJobReport(reportFile2)

    reports1.extend(reports2)

    output = IMProvDoc("JobReports")
    for item in reports1:
        output.addNode(item.save())
    handle = open(reportFile1, 'w')
    handle.write(output.makeDOMDocument().toprettyxml())
    handle.close()
    return
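# Usage sketch for mergeReports() -- illustrative only; both file names are
# assumptions. After the call, JobReports.xml holds the reports from both
# files; a missing second file raises RuntimeError.
mergeReports("JobReports.xml", "FrameworkJobReport.xml")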
def jobEnd(self):
    """
    Job ended notifier.
    """
    if self.dashboardInfo is None:
        return
    newInfo = self.dashboardInfo.emptyClone()
    newInfo.addDestination(self.destHost, self.destPort)
    try:
        reports = readJobReport("FrameworkJobReport.xml")
        newInfo['JobExitStatus'] = reports[-1].exitCode
        newInfo.update(PerformanceSummary()(*reports))
    except:
        newInfo['JobExitStatus'] = 50116
    newInfo['JobFinished'] = time.time()
    newInfo.publish(1)
def merge_reports(self):
    reports = []
    tdir = os.path.dirname(self.__jobdir)
    for path in self.__rpaths:
        f = gzip.open(os.path.join(os.path.dirname(tdir), path, 'report.xml.gz'))
        report = readJobReport(f)
        reports.extend(report)
        f.close()

    output = IMProvDoc("JobReports")
    for item in reports:
        output.addNode(item.save())

    outfile = gzip.open(os.path.join(self.__jobdir, 'report.xml.gz'), 'wb')
    outfile.write(output.makeDOMDocument().toprettyxml())
    outfile.close()
def publishAJobReport(self, file, procdataset):
    """
    input: xml file, processedDataset
    """
    common.logger.debug("FJR = %s" % file)
    try:
        jobReport = readJobReport(file)[0]
        self.exit_status = '0'
    except IndexError:
        self.exit_status = '1'
        msg = "Error: Problem with " + file + " file"
        raise CrabException(msg)

    ### skip publication for 0-event files
    filestopublish = []
    for file in jobReport.files:
        #### added check for problem with copy to SE and empty lfn
        if (string.find(file['LFN'], 'copy_problems') != -1):
            self.problemFiles.append(file['LFN'])
        elif (file['LFN'] == ''):
            self.noLFN.append(file['PFN'])
        else:
            if int(file['TotalEvents']) == 0:
                self.noEventsFiles.append(file['LFN'])
            for ds in file.dataset:
                ### Fede for production
                if (ds['PrimaryDataset'] == 'null'):
                    ds['PrimaryDataset'] = self.userprocessedData
            filestopublish.append(file)
    jobReport.files = filestopublish
    for file in filestopublish:
        common.logger.debug("--->>> LFN of file to publish = " + str(file['LFN']))

    ### if all files of the FJR have number of events = 0
    if (len(filestopublish) == 0):
        return None

    #// DBS to contact
    dbswriter = DBSWriter(self.DBSURL)
    # insert files
    Blocks = None
    try:
        ### FEDE added insertDetectorData = True to propagate run and lumi info into DBS
        Blocks = dbswriter.insertFiles(jobReport, insertDetectorData=True)
        #Blocks = dbswriter.insertFiles(jobReport)
        common.logger.debug("--->>> Inserting file in blocks = %s" % Blocks)
    except DBSWriterError, ex:
        common.logger.debug("--->>> Insert file error: %s" % ex)
def run(self):
    # load FwkJobRep
    try:
        jobReport = readJobReport(self.input)[0]
    except:
        print '50115'
        sys.exit()

    if self.exitCode:
        self.exitCodes(jobReport)
    if self.lfnList:
        self.lfn_List(jobReport)
    if self.info2dash:
        self.reportDash(jobReport)
    if self.popularity:
        self.popularityInfos(jobReport)
    return
def ExtractFailureInformation(archive, ExtractDir, debug):
    """
    open archive in ExtractDir and extract failure information
    use the first description and LFN found, ignore the following ones
    """
    # extract the FrameworkJobReport.xml
    if os.path.isfile(archive) and tarfile.is_tarfile(archive):
        jobtarfile = tarfile.open(archive, 'r:gz')
        reportFileList = [tf for tf in jobtarfile.getnames()
                          if tf.count("FrameworkJobReport")]
        reportFileList.sort()
        result = {}
        if len(reportFileList) > 0:
            reportFile = reportFileList[-1]
            jobtarfile.extract(reportFile, ExtractDir)
            reportfilename = os.path.join(ExtractDir, reportFile)
            if os.path.isfile(reportfilename):
                if debug:
                    print 'Opening FrameworkJobReport:', reportfilename, 'from archive:', archive
                reports = readJobReport(reportfilename)
                for report in reports:
                    logfile = None
                    failure = None
                    if len(report.logFiles) > 0:
                        logfile = report.logFiles.keys()[0]
                    if len(report.errors) > 0:
                        failure = report.errors[0]['Description'].replace('\t', '')
                    result[logfile] = failure
        else:
            print 'Archive:', archive, 'does not contain a FrameworkJobReport*.xml, skipping!'
        return result
    else:
        print 'Archive:', archive, 'is not a gzipped tarball, skipping!'
        return None
def execute(self, stateParameters={}):
    logging.debug("Executing state: ReportJobSuccess")
    # examine if the job is a failure or not and treat it appropriately
    if stateParameters['jobType'] == 'failure':
        job_spec_id = stateParameters['jobReport']
    else:
        jobReport = stateParameters['jobReport']
        # retrieve relevant information:
        report = readJobReport(jobReport)
        logging.debug('jobreport is: ' + str(jobReport))
        try:
            logging.debug('jobspecid is: ' + str(report[-1].jobSpecId))
        except Exception, ex:
            msg = """WARNING: Something might be wrong with the generated job report.
Unless this is a merge job or a non prodmgr job, you can ignore this message.
If it is a prodmgr process job, check that it exists and is properly formatted.
ProdMgr will ignore this job as it does not have sufficient information to handle it.
It might be that this is a prodmgr job, in which case some residual information
is left in the database.
%s
""" % (str(ex))
            logging.debug(msg)
            return
        job_spec_id = report[-1].jobSpecId
def run(self):
    """
    Parse all xml files in the res dir and create the dictionary
    """
    task = common._db.getTask()
    good_list = []   # list of fjr's to publish
    common.logger.info("Listing crab_fjr files")
    nj = 0
    for job in task.getJobs():
        nj += 1
        if nj % 100 == 0:
            common.logger.info("checking job %d" % nj)
        if (job.runningJob['applicationReturnCode'] != 0 or
            job.runningJob['wrapperReturnCode'] != 0):
            continue
        # get FJR filename
        fjr = self.fjrDirectory + job['outputFiles'][-1]
        reports = readJobReport(fjr)
        if len(reports) > 0 and reports[0].status == "Success":
            goodReport = True
            # sanity check: is there at least one run?
            for outFile in reports[0].files:
                if len(outFile['Runs']) == 0:
                    msg = "ERROR: no run/lumi info. Skip FJR file %s" % fjr
                    common.logger.info(msg)
                    goodReport = False
            if not goodReport:
                continue
            good_list.append(fjr)

    if len(good_list) == 0:
        common.logger.info("No fjr with exit code =0 to be published")
        status = '0'
        return status

    common.logger.info("Task has %d jobs" % len(task.getJobs()))
    common.logger.info(
        "Found %d fjr's with exit code=0 to be considered for publication"
        % len(good_list))

    pubToDBS2 = False
    pubToDBS3 = True
    if self.cfg_params.get('CMSSW.publish_dbs2', None) == "1":
        pubToDBS2 = True
        pubToDBS3 = False

    if pubToDBS2:
        status = self.DBS2Publish(good_list)
    elif pubToDBS3:
        argsForDbs3 = self.PrepareForDBS3Publish(good_list)
        globalApi = argsForDbs3['globalApi']
        sourceApi = argsForDbs3['sourceApi']
        inputDataset = argsForDbs3['inputDataset']
        toPublish = argsForDbs3['toPublish']
        destApi = argsForDbs3['destApi']
        destReadApi = argsForDbs3['destReadApi']
        migrateApi = argsForDbs3['migrateApi']
        originSite = argsForDbs3['origin_site_name']
        (failed, published, results) = publishInDBS3(\
            sourceApi, globalApi, inputDataset, toPublish,
            destApi, destReadApi, migrateApi, originSite)
        if len(failed) == 0:
            status = '0'
        else:
            status = '1'
    else:
        raise CrabException(
            'Could not decide whether to publish to DBS2 or DBS3')
    return status
def ExtractPerformanceInformation(archive, ExtractDir, debug):
    """
    open archive in ExtractDir and extract performance information
    """
    # extract the FrameworkJobReport.xml
    if os.path.isfile(archive) and tarfile.is_tarfile(archive):
        jobtarfile = tarfile.open(archive, 'r:gz')
        reportFileList = [tf for tf in jobtarfile.getnames()
                          if tf.count("FrameworkJobReport.xml")]
        if len(reportFileList) > 0:
            reportFile = reportFileList[0]
            results = []   ### HERE!
            result = {}
            jobtarfile.extract(reportFile, ExtractDir)
            reportfilename = os.path.join(ExtractDir, reportFile)
            if os.path.isfile(reportfilename):
                if debug:
                    print 'Opening FrameworkJobReport:', reportfilename, 'from archive:', archive
                reports = readJobReport(reportfilename)
                for report in reports:
                    performanceReport = report.performance
                    # average physical memory
                    try:
                        mem = performanceReport.summaries['RSSMemory']['AvgRSSMemory']
                        result['AverageMemory'] = float(mem)
                    except:
                        print 'Cannot read RSSMemory:AvgRSSMemory from FrameworkJobReport.xml from', archive
                        result['AverageMemory'] = 0.
                    # max physical memory
                    try:
                        mem = performanceReport.summaries['RSSMemory']['MaxRSSMemory']
                        result['MaxMemory'] = float(mem)
                    except:
                        print 'Cannot read RSSMemory:MaxRSSMemory from FrameworkJobReport.xml from', archive
                        result['MaxMemory'] = 0.
                    # events per job from trigger report
                    try:
                        events = performanceReport.summaries['TrigReport']['TotalEvents']
                        result['Events'] = int(events)
                    except:
                        try:
                            events = 0
                            for inputFile in report.inputFiles:
                                events += int(inputFile['EventsRead'])
                            result['Events'] = events
                        except:
                            print 'Cannot read TrigReport:TotalEvents from FrameworkJobReport.xml from', archive
                            result['Events'] = 0
                    # time per event
                    try:
                        time = performanceReport.summaries['Timing']['RealPerEvent']
                        result['TimePerEvent'] = float(time)
                    except:
                        try:
                            timePerJob = float(report.timing['AppEndTime']) - float(report.timing['AppStartTime'])
                            result['TimePerEvent'] = timePerJob / float(result['Events'])
                        except:
                            print 'Cannot read Timing:RealPerEvent from FrameworkJobReport.xml from', archive
                            result['TimePerEvent'] = 0
                    results.append(result)   ### HERE!
                    result = {}   ### HERE!
            #return result
            return results   ### HERE!
        else:
            print 'Archive:', archive, 'does not contain a FrameworkJobReport.xml, skipping!'
            return None
    else:
        print 'Archive:', archive, 'is not a gzipped tarball, skipping!'
        return None
def parseFinalReport(self, input):
    """
    Parses the FJR produced by the job in order to retrieve
    the WrapperExitCode and the ExeExitCode.
    Updates the BossDB with these values.
    """
    from ProdCommon.FwkJobRep.ReportParser import readJobReport

    codeValue = {}

    jreports = readJobReport(input)
    if len(jreports) <= 0:
        codeValue["applicationReturnCode"] = str(50115)
        codeValue["wrapperReturnCode"] = str(50115)
        common.logger.debug("Empty FrameworkJobReport: error code assigned is 50115 ")
        return codeValue

    jobReport = jreports[0]
    exit_status = ''

    ##### temporary fix for incomplete FJR ####
    fjr = open(input)
    len_fjr = len(fjr.readlines())
    if (len_fjr <= 6):
        ### 50115 - cmsRun did not produce a valid/readable job report at runtime
        codeValue["applicationReturnCode"] = str(50115)
        codeValue["wrapperReturnCode"] = str(50115)

    if len(jobReport.errors) != 0:
        for error in jobReport.errors:
            if error['Type'] == 'WrapperExitCode':
                codeValue["wrapperReturnCode"] = error['ExitStatus']
            elif error['Type'] == 'ExeExitCode':
                codeValue["applicationReturnCode"] = error['ExitStatus']
            if error['Type'] == 'CMSException':
                codeValue["applicationReturnCodeOrig"] = error['ExitStatus']
            else:
                continue

    if not codeValue.has_key('wrapperReturnCode'):
        codeValue["wrapperReturnCode"] = ''
    if not codeValue.has_key('applicationReturnCode'):
        if codeValue.has_key('applicationReturnCodeOrig'):
            codeValue["applicationReturnCode"] = \
                codeValue["applicationReturnCodeOrig"]
            codeValue.pop("applicationReturnCodeOrig")
        else:
            codeValue["applicationReturnCode"] = ''
    else:
        if codeValue.has_key('applicationReturnCodeOrig'):
            codeValue.pop("applicationReturnCodeOrig")

    #### Filling BOSS DB with SE name and LFN, for edm and not_edm files ####
    lfns = []
    pfns = []

    if (len(jobReport.files) != 0):
        for f in jobReport.files:
            if f['LFN']:
                lfns.append(f['LFN'])
            if f['PFN']:
                #### FEDE: use the correct endpoint for copyData (we modify the bossDB value and not the fjr)
                if common.scheduler.name().upper() not in ['LSF', 'CAF', 'PBS'] \
                   and codeValue["wrapperReturnCode"] == 60308:
                    pfns.append(os.path.dirname(f['SurlForGrid']) + '/')
                else:
                    pfns.append(os.path.dirname(f['PFN']) + '/')

    if (len(jobReport.analysisFiles) != 0):
        for aFile in jobReport.analysisFiles:
            if aFile['LFN']:
                lfns.append(aFile['LFN'])
            if aFile['PFN']:
                #### FEDE: use the correct endpoint for copyData (we modify the bossDB value and not the fjr)
                if common.scheduler.name().upper() not in ['LSF', 'CAF', 'PBS'] \
                   and codeValue["wrapperReturnCode"] == 60308:
                    pfns.append(os.path.dirname(aFile['SurlForGrid']) + '/')
                else:
                    pfns.append(os.path.dirname(aFile['PFN']) + '/')

    codeValue["storage"] = pfns
    codeValue["lfn"] = lfns
    return codeValue
def PrepareForDBS3Publish(self, good_list):
    from dbs.apis.dbsClient import DbsApi as Dbs3Api
    from ProdCommon.FwkJobRep.ReportParser import readJobReport

    # extract information from self and good_list and format it as liked by the publishInDBS3 function
    originSite = None

    (isDbs2, isDbs3, dbs2_url, dbs3_url) = verify_dbs_url(self)
    sourceUrl = dbs3_url

    prdDBSurl = 'https://cmsweb.cern.ch/dbs/prod/phys03/'
    intDBSurl = 'https://cmsweb-testbed.cern.ch/dbs/int/phys03/'
    devDBSurl = 'https://dbs3-dev01.cern.ch/dbs/dev/phys03/'
    devDBSurl = 'https://dbs3-testbed.cern.ch/dbs/dev/phys03/'

    destDBSurl = prdDBSurl
    if self.cfg_params.get('CMSSW.dbs3-int', None) == "1":
        destDBSurl = intDBSurl
    if self.cfg_params.get('CMSSW.dbs3-dev', None) == "1":
        destDBSurl = devDBSurl
    pubUrl = self.cfg_params.get('CMSSW.dbs-pub', None)
    if pubUrl:
        if not '/' in pubUrl:
            destDBSurl = 'https://' + pubUrl + '.cern.ch/dbs/dev/phys03/'
        else:
            destDBSurl = 'https://' + pubUrl + '/'
    common.logger.info('your dataset will be published in %s' % destDBSurl)

    destUrl = destDBSurl + 'DBSWriter'
    destReadUrl = destDBSurl + 'DBSReader'
    migrateUrl = destDBSurl + 'DBSMigrate'
    inputDataset = self.cfg_params.get('CMSSW.datasetpath', 'None')

    sourceApi = Dbs3Api(url=sourceUrl)

    # when looking up parents we may need to look in global DBS as well
    globalUrl = sourceUrl   # if this is not global, the next lines will make it so
    globalUrl = globalUrl.replace('phys01', 'global')
    globalUrl = globalUrl.replace('phys02', 'global')
    globalUrl = globalUrl.replace('phys03', 'global')
    globalUrl = globalUrl.replace('caf', 'global')
    globalApi = Dbs3Api(url=globalUrl)

    destinationApi = Dbs3Api(url=destUrl)
    # be safe, use the RO Api unless we really want to write
    destinationReadApi = Dbs3Api(url=destReadUrl)
    migrateApi = Dbs3Api(url=migrateUrl)

    # the publishInDBS3 copied from CRAB3 needs the info in the toPublish dictionary
    # format is: {outdataset: files}
    # outdataset = dataset to be published (full name /prim/proc/tier)
    # files is a list of dictionaries, one per LFN
    toPublish = {}

    common.logger.info("parsing crab_fjr files...")
    nfjr = 0
    for crabFjr in good_list:   # this is the list of FJR's in crab res
        fjr = readJobReport(crabFjr)[0]   # parse into python
        nfjr += 1
        if nfjr % 100 == 0:
            common.logger.info("parsed %d crab_fjr files..." % nfjr)
        if not fjr.files:
            msg = "WARNING: No EDM file to be published in %s" % crabFjr.split('/')[-1]
            common.logger.info(msg)
        for outFile in fjr.files:   # one fjr may have multiple output LFN's
            dset_info = outFile.dataset[0]   # better there is only one dataset per file!
            procds = dset_info['ProcessedDataset']
            primds = dset_info['PrimaryDataset']
            if primds == 'null':
                # user MC: derive the publish data name by stripping username and hash from procds
                primds = '-'.join(procds.split('-')[1:-1])
            tier = dset_info['DataTier']
            outdataset = "/%s/%s/%s" % (primds, procds, tier)
            if not toPublish.has_key(outdataset):
                toPublish[outdataset] = []
            fileDic = {}   # prepare dictionary to publish this LFN
            fileDic['cksum'] = outFile['Checksum']
            fileDic['md5'] = "NOTSET"
            fileDic['adler32'] = "NOTSET"
            fileDic['acquisitionera'] = "null"
            fileDic['globaltag'] = "None"
            fileDic['publishname'] = procds
            fileDic['outdataset'] = outdataset
            fileDic['swversion'] = dset_info['ApplicationVersion']
            fileDic['lfn'] = outFile['LFN']
            fileDic['filesize'] = outFile['Size']
            fileDic['runlumi'] = outFile['Runs']
            # beware duplicate parents in FJR and parent='' when FJR had no LFN
            fileDic['parents'] = [p for p in set(outFile.parentLFNs()) if p]
            if originSite:
                if outFile['SEName'] != originSite:
                    msg = "ERROR: not all files to be published have the same location\n"
                    msg += "file %s has origin %s, while previous files have %s\n" %\
                        (outFile['LFN'], outFile['SEName'], originSite)
                    raise CrabException(msg)
            else:
                originSite = outFile['SEName']
            fileDic['inevents'] = outFile['TotalEvents']
            toPublish[outdataset].append(fileDic)   # add dictionary to files list

    # all done
    common.logger.info("parsing of crab_fjr files completed")
    argsForDbs3 = { \
        'sourceApi': sourceApi,
        'globalApi': globalApi,
        'inputDataset': inputDataset,
        'toPublish': toPublish,
        'destApi': destinationApi,
        'destReadApi': destinationReadApi,
        'migrateApi': migrateApi,
        'origin_site_name': originSite}
    return argsForDbs3
def combineReports(reportFile, reportNames, newReportInstance):
    """
    Combine reports: take some fields from the report with the given name
    in reportFile and then overwrite with newReportInstance

    Note: newReportInstance is modified, and should be written back as
    the task fjr - else subsequent tasks will take the wrong one!!!
    """
    if not os.path.exists(reportFile):
        existingReports = []
    else:
        existingReports = readJobReport(reportFile)

    if not isinstance(reportNames, list):
        reportNames = [reportNames]

    reportFound = False
    output = IMProvDoc("JobReports")

    # wipe old values ready for new ones
    newReportInstance.inputFiles = []
    newReportInstance.generatorInfo = {}   # how to handle multiple?

    for report in existingReports:
        if report.name in reportNames:
            reportFound = True
            # copy some values across from the old report
            newReportInstance.inputFiles.extend(report.inputFiles)
            newReportInstance.skippedEvents.extend(report.skippedEvents)
            newReportInstance.skippedFiles.extend(report.skippedFiles)
            # loop over output files and change provenance to the 1st node's
            for outfile in newReportInstance.files:
                oldinputfiles = outfile.inputFiles
                outfile.inputFiles = []   # clear ready for correct provenance info
                for infile in oldinputfiles:
                    # find the ancestor input files in the previous report
                    for ancestor in report.files:
                        if ancestor['LFN'] == infile['LFN']:
                            outfile.inputFiles.extend(ancestor.inputFiles)
                            print "Updated InputFiles %s for %s" % (
                                ancestor.inputFiles, outfile['LFN'])
                        # No LFN, use PFN (needed for parent forwarding)
                        elif not infile['LFN'] and \
                            ancestor['PFN'] == infile['PFN']:
                            outfile.inputFiles.extend(ancestor.inputFiles)
                            print "Updated InputFiles %s for %s" % (
                                ancestor.inputFiles, outfile['LFN'])
            if report.timing.has_key('AppStartTime') and \
                report.timing['AppStartTime'] < newReportInstance.timing.get('AppStartTime', time.time()):
                newReportInstance.timing['AppStartTime'] = report.timing['AppStartTime']
            continue

        # if here either this report is not one of the inputs
        # or the report contained a staged out file
        # - in either case it must be saved
        output.addNode(report.save())

    if not reportFound:
        raise RuntimeError, "Reports not combined: %s not found in %s" % \
            (str(reportNames), reportFile)

    output.addNode(newReportInstance.save())
    handle = open(reportFile, 'w')
    handle.write(output.makeDOMDocument().toprettyxml())
    handle.close()
    return newReportInstance
def publishDataset(self, file):
    """
    Publish in DBS the dataset(s) described in the given crab FJR file.
    """
    try:
        jobReport = readJobReport(file)[0]
        self.exit_status = '0'
    except IndexError:
        self.exit_status = '1'
        msg = "Error: Problem with " + file + " file"
        common.logger.info(msg)
        return self.exit_status

    if (len(self.dataset_to_import) != 0):
        for dataset in self.dataset_to_import:
            common.logger.info("--->>> Importing parent dataset in the dbs: " + dataset)
            status_import = self.importParentDataset(self.globalDBS, dataset)
            if (status_import == 1):
                common.logger.info('Problem with parent ' + dataset +
                                   ' import from the global DBS ' + self.globalDBS +
                                   ' to the local one ' + self.DBSURL)
                self.exit_status = '1'
                return self.exit_status
            else:
                common.logger.info('Import ok of dataset ' + dataset)

    if (len(jobReport.files) <= 0):
        self.exit_status = '1'
        msg = "Error: No EDM file to publish in xml file " + file
        common.logger.info(msg)
        return self.exit_status
    else:
        msg = "fjr contains some files to publish"
        common.logger.debug(msg)

    #### datasets creation in dbs
    #// DBS to contact: write and read of the same dbs
    dbsReader = DBSReader(self.DBSURL, level='ERROR')
    dbswriter = DBSWriter(self.DBSURL)
    #####

    self.published_datasets = []
    for fileinfo in jobReport.files:
        datasets_info = fileinfo.dataset
        if len(datasets_info) <= 0:
            self.exit_status = '1'
            msg = "Error: No info about dataset in the xml file " + file
            common.logger.info(msg)
            return self.exit_status
        else:
            for dataset in datasets_info:
                #### for production data
                self.processedData = dataset['ProcessedDataset']
                if (dataset['PrimaryDataset'] == 'null'):
                    dataset['PrimaryDataset'] = self.userprocessedData
                elif self.datasetpath.upper() != 'NONE':
                    dataset['ParentDataset'] = self.datasetpath

                dataset['PSetContent'] = self.content
                cfgMeta = {'name': self.pset, 'Type': 'user',
                           'annotation': 'user cfg',
                           'version': 'private version'}   # add real name of user cfg
                common.logger.info("PrimaryDataset = %s" % dataset['PrimaryDataset'])
                common.logger.info("ProcessedDataset = %s" % dataset['ProcessedDataset'])
                common.logger.info("<User Dataset Name> = /" + dataset['PrimaryDataset'] +
                                   "/" + dataset['ProcessedDataset'] + "/USER")

                self.dataset_to_check = "/" + dataset['PrimaryDataset'] + "/" + \
                                        dataset['ProcessedDataset'] + "/USER"
                self.published_datasets.append(self.dataset_to_check)

                common.logger.log(10 - 1, "--->>> Inserting primary: %s processed : %s" %
                                  (dataset['PrimaryDataset'], dataset['ProcessedDataset']))

                #### check if the dataset already exists in the DBS
                result = dbsReader.matchProcessedDatasets(
                    dataset['PrimaryDataset'], 'USER', dataset['ProcessedDataset'])
                if (len(result) != 0):
                    result = dbsReader.listDatasetFiles(self.dataset_to_check)

                primary = DBSWriterObjects.createPrimaryDataset(dataset, dbswriter.dbs)
                common.logger.log(10 - 1, "Primary: %s " % primary)
                print "primary = ", primary

                algo = DBSWriterObjects.createAlgorithm(dataset, cfgMeta, dbswriter.dbs)
                common.logger.log(10 - 1, "Algo: %s " % algo)

                processed = DBSWriterObjects.createProcessedDataset(
                    primary, algo, dataset, dbswriter.dbs)
                common.logger.log(10 - 1, "Processed: %s " % processed)
                print "processed = ", processed

                common.logger.log(10 - 1, "Inserted primary %s processed %s" %
                                  (primary, processed))
    #######################################################################################
    common.logger.log(10 - 1, "exit_status = %s " % self.exit_status)
    return self.exit_status
    diz[L[i]] = L[i+1]
    i = i + 2

if diz.has_key('json'):
    json_file = diz['json']
    fp = open(json_file, "r")
    inputDict = json.load(fp)
    fp.close()
    print "inputDict = ", inputDict
else:
    print "Error: no json file provided"
    sys.exit()

if diz.has_key('fjr'):
    inputReport = diz['fjr']
    reports = readJobReport(inputReport)
    # report is an instance of the FwkJobRep.FwkJobReport class
    # there can be N in a file, so a list is always returned
    # by readJobReport; here I am assuming just one report per file for simplicity
    try:
        report = reports[-1]
    except IndexError:
        print "Error: fjr file does not contain enough information"
        sys.exit(1)
else:
    print "Error: no fjr provided"
    sys.exit(1)

# ARGs parameters
def handleError(self, payload):
    """
    The payload of a job failure is a url to the job report
    """
    jobReportUrl = payload

    # prepare to retrieve the job report file.
    # NOTE: we assume that the report file has a relatively unique name
    # NOTE: if that is not the case we need to add a unique identifier to it.
    slash = jobReportUrl.rfind('/')
    fileName = jobReportUrl[slash+1:]
    urllib.urlretrieve(jobReportUrl, \
                       self.args['jobReportLocation'] + '/' + fileName)
    logging.debug(">RunFailureHandler<:Retrieving job report from %s " % jobReportUrl)

    jobReport = readJobReport(self.args['jobReportLocation'] + '/' + fileName)
    # NOTE: is this the right way to extract the job id?
    jobId = jobReport[0].jobSpecId
    logging.debug(">RunFailureHandler<:Retrieving jobId from job report " + \
                  "(used to dynamically load error handler) " \
                  "jobId=" + str(jobId))

    # create the jobReportLocation jobId hierarchy if it does not exist.
    pipe = os.popen("mkdir -p " + self.args['jobReportLocation'] + '/' + jobId)
    pipe.close()

    # move the report file to this new location.
    pipe = os.popen("mv " + self.args['jobReportLocation'] + '/' + fileName + " " + \
                    self.args['jobReportLocation'] + '/' + jobId)
    logging.debug(">RunFailureHandler<:Moving job report to permanent storage: " \
                  + self.args['jobReportLocation'] + '/' + jobId)
    pipe.close()

    reportLocation = self.args['jobReportLocation'] + '/' + \
                     jobId + '/' + fileName

    generalInfo = JobState.general(jobId)

    # a submit event with delay
    delay = int(self.args['DelayFactor']) * (int(generalInfo['Retries'] + 1))
    delay = convertSeconds(delay)
    logging.debug(">RunFailureHandler<: re-submitting with delay (h:m:s) " + \
                  str(delay))

    if self.args['ReportAction'] == 'move':
        # count how many files are in the dir (to generate unique ids
        # when moving files)
        try:
            lastID = len(os.listdir(os.path.dirname(payload)))
            target = os.path.join(os.path.dirname(payload), \
                                  os.path.basename(payload).split('.')[0] + \
                                  str(lastID) + \
                                  '.xml')
            logging.debug('Moving file: ' + payload + ' to: ' + target)
            shutil.move(payload, target)
        except:
            pass

    try:
        JobState.runFailure(jobId, jobReportLocation=reportLocation)

        # check the cache dir size. If it is beyond the threshold, purge it.
        dirSizeBytes = dirSize(generalInfo['CacheDirLocation'], 0, 0, 0)
        dirSizeMegaBytes = convertSize(dirSizeBytes, 'm')
        logging.debug(">RunFailureHandler<:Cache dir. size is " + \
                      str(dirSizeMegaBytes) + " MB. Maximum allowed is " + \
                      str(self.maxCacheDirSizeMB) + " MB ")

        jobspecfile = "%s/%s-JobSpec.xml" % (generalInfo['CacheDirLocation'], jobId)

        # if necessary a partial cleanup is done first, which publishes
        # the proper event when it is finished.
        # retrieve the number of retries and publish
        if (float(dirSizeMegaBytes) > float(self.maxCacheDirSizeMB)):
            newPayload = jobId + ",SubmitJob," + jobId + "," + str(delay)
            logging.debug(">RunFailureHandler<: Reached maximum cache size. " + \
                          "Performing partial cache cleanup first.")
            self.publishEvent("PartialJobCleanup", newPayload, delay)
        else:
            logging.debug(">RunFailureHandler<:Registered " + \
                          "a job run failure, " \
                          "publishing a submit job event")
            if self.args['QueueFailures']:
                JobQueueAPI.reQueueJob(jobId)
            else:
                self.publishEvent("SubmitJob", jobspecfile, delay)

    except ProdException, ex:
        if (ex["ErrorNr"] == 3013):
            logging.debug(">RunFailureHandler<:Registered " + \
                          "a job run failure. " + \
                          "Maximum number of retries reached! " + \
                          "Submitting a failure job and cleanup event ")
            JobState.failed(jobId)
            self.publishEvent("FailureCleanup", (jobId))
            self.publishEvent("GeneralJobFailure", (jobId))
        ApplicationVersion = value
    elif option == "--PSetHash":
        PSetHash = value
    elif option == "--SEName":
        SEName = value
    elif option == "--pfn-path":
        pfn_path = value
    elif option == "--lfn-path":
        lfn_path = value
    elif option == "--strip-input-file-info":
        strip_input_file_info = 1
    else:
        sys.stderr.write("Unexpected option: " + str(option) + "\n")
        sys.exit(2)

reports = readJobReport(inputReport)
# report is an instance of the FwkJobRep.FwkJobReport class
# there can be N in a file, so a list is always returned
# here I am assuming just one report per file for simplicity
if len(reports) != 1:
    sys.stderr.write("ERROR: Found %d reports in %s\n" % (len(reports), inputReport))
    sys.exit(1)

report = reports[-1]

if (len(report.files) == 0):
    print "no output file to modify"
    sys.exit(1)

# CRAB requires this status == "Success"
def run(self):
    """
    Parse all xml files in the res dir and create the dictionary
    """
    task = common._db.getTask()
    good_list = []
    for job in task.getJobs():
        if (job.runningJob['applicationReturnCode'] != 0 or
            job.runningJob['wrapperReturnCode'] != 0):
            continue
        # get FJR filename
        fjr = self.fjrDirectory + job['outputFiles'][-1]
        reports = readJobReport(fjr)
        if len(reports) > 0:
            if reports[0].status == "Success":
                good_list.append(fjr)

    ####################################################
    if self.no_inp == 1:
        file_list = self.remove_input_from_fjr(good_list)
    else:
        file_list = good_list
    print "file_list = ", file_list
    ####################################################
    common.logger.log(10 - 1, "fjr with FrameworkJobReport Status='Success', file_list = " + str(file_list))
    common.logger.log(10 - 1, "len(file_list) = " + str(len(file_list)))

    if (len(file_list) > 0):
        BlocksList = []
        common.logger.info("--->>> Start dataset publication")
        self.exit_status = self.publishDataset(file_list[0])
        if (self.exit_status == '1'):
            return self.exit_status
        common.logger.info("--->>> End dataset publication")

        common.logger.info("--->>> Start files publication")
        for file in file_list:
            Blocks = self.publishAJobReport(file, self.processedData)
            if Blocks:
                for x in Blocks:   # do not allow multiple entries of the same block
                    if x not in BlocksList:
                        BlocksList.append(x)

        # close the blocks
        common.logger.log(10 - 1, "BlocksList = %s" % BlocksList)
        dbswriter = DBSWriter(self.DBSURL)

        for BlockName in BlocksList:
            try:
                closeBlock = dbswriter.manageFileBlock(BlockName, maxFiles=1)
                common.logger.log(10 - 1, "closeBlock %s" % closeBlock)
            except DBSWriterError, ex:
                common.logger.info("Close block error %s" % ex)

        if (len(self.noEventsFiles) > 0):
            common.logger.info("--->>> WARNING: " + str(len(self.noEventsFiles)) +
                               " published files contain 0 events; they are:")
            for lfn in self.noEventsFiles:
                common.logger.info("------ LFN: %s" % lfn)
        if (len(self.noLFN) > 0):
            common.logger.info("--->>> WARNING: there are " + str(len(self.noLFN)) +
                               " files not published because they have an empty LFN")
            for pfn in self.noLFN:
                common.logger.info("------ pfn: %s" % pfn)
        if (len(self.problemFiles) > 0):
            common.logger.info("--->>> WARNING: " + str(len(self.problemFiles)) +
                               " files not published because they had problems with the copy to the SE")
            for lfn in self.problemFiles:
                common.logger.info("------ LFN: %s" % lfn)
        common.logger.info("--->>> End files publication")

        #### FEDE for MULTI ####
        for dataset_to_check in self.published_datasets:
            self.cfg_params['USER.dataset_to_check'] = dataset_to_check
            from InspectDBS import InspectDBS
            check = InspectDBS(self.cfg_params)
            check.checkPublication()
        #########################

    return self.exit_status
def PrepareForDBS3Publish(self, good_list):
    from dbs.apis.dbsClient import DbsApi as Dbs3Api
    from ProdCommon.FwkJobRep.ReportParser import readJobReport

    # extract information from self and good_list and format it as liked by the publishInDBS3 function
    originSite = None

    (isDbs2, isDbs3, dbs2_url, dbs3_url) = verify_dbs_url(self)
    sourceUrl = dbs3_url

    prdDBSurl = 'https://cmsweb.cern.ch/dbs/prod/phys03/'
    intDBSurl = 'https://cmsweb-testbed.cern.ch/dbs/int/phys03/'
    devDBSurl = 'https://dbs3-dev01.cern.ch/dbs/dev/phys03/'
    devDBSurl = 'https://dbs3-testbed.cern.ch/dbs/dev/phys03/'

    destDBSurl = prdDBSurl
    if self.cfg_params.get('CMSSW.dbs3-int', None) == "1":
        destDBSurl = intDBSurl
    if self.cfg_params.get('CMSSW.dbs3-dev', None) == "1":
        destDBSurl = devDBSurl
    pubUrl = self.cfg_params.get('CMSSW.dbs-pub', None)
    if pubUrl:
        if not '/' in pubUrl:
            destDBSurl = 'https://' + pubUrl + '.cern.ch/dbs/dev/phys03/'
        else:
            destDBSurl = 'https://' + pubUrl + '/'
    common.logger.info('your dataset will be published in %s' % destDBSurl)

    destUrl = destDBSurl + 'DBSWriter'
    destReadUrl = destDBSurl + 'DBSReader'
    migrateUrl = destDBSurl + 'DBSMigrate'
    inputDataset = self.cfg_params.get('CMSSW.datasetpath', 'None')

    sourceApi = Dbs3Api(url=sourceUrl)

    # when looking up parents we may need to look in global DBS as well
    globalUrl = sourceUrl   # if this is not global, the next lines will make it so
    globalUrl = globalUrl.replace('phys01', 'global')
    globalUrl = globalUrl.replace('phys02', 'global')
    globalUrl = globalUrl.replace('phys03', 'global')
    globalUrl = globalUrl.replace('caf', 'global')
    globalApi = Dbs3Api(url=globalUrl)

    destinationApi = Dbs3Api(url=destUrl)
    # be safe, use the RO Api unless we really want to write
    destinationReadApi = Dbs3Api(url=destReadUrl)
    migrateApi = Dbs3Api(url=migrateUrl)

    # the publishInDBS3 copied from CRAB3 needs the info in the toPublish dictionary
    # format is: {outdataset: files}
    # outdataset = dataset to be published (full name /prim/proc/tier)
    # files is a list of dictionaries, one per LFN
    toPublish = {}

    common.logger.info("parsing crab_fjr files...")
    nfjr = 0
    for crabFjr in good_list:   # this is the list of FJR's in crab res
        fjr = readJobReport(crabFjr)[0]   # parse into python
        nfjr += 1
        if nfjr % 100 == 0:
            common.logger.info("parsed %d crab_fjr files..." % nfjr)
        if not fjr.files:
            msg = "WARNING: No EDM file to be published in %s" % crabFjr.split('/')[-1]
            common.logger.info(msg)
        for outFile in fjr.files:   # one fjr may have multiple output LFN's
            dset_info = outFile.dataset[0]   # better there is only one dataset per file!
            procds = dset_info['ProcessedDataset']
            primds = dset_info['PrimaryDataset']
            if primds == 'null':
                # user MC: derive the publish data name by stripping username and hash from procds
                primds = '-'.join(procds.split('-')[1:-1])
            tier = dset_info['DataTier']
            outdataset = "/%s/%s/%s" % (primds, procds, tier)
            if not toPublish.has_key(outdataset):
                toPublish[outdataset] = []
            fileDic = {}   # prepare dictionary to publish this LFN
            fileDic['cksum'] = outFile['Checksum']
            fileDic['md5'] = "NOTSET"
            fileDic['adler32'] = "NOTSET"
            fileDic['acquisitionera'] = "null"
            fileDic['globaltag'] = "None"
            fileDic['publishname'] = procds
            fileDic['outdataset'] = outdataset
            fileDic['swversion'] = dset_info['ApplicationVersion']
            fileDic['lfn'] = outFile['LFN']
            fileDic['filesize'] = outFile['Size']
            fileDic['runlumi'] = outFile['Runs']
            if outFile['PNN']:
                origin = outFile['PNN']
            else:
                origin = outFile['SEName']
            # beware duplicate parents in FJR and parent='' when FJR had no LFN
            fileDic['parents'] = [p for p in set(outFile.parentLFNs()) if p]
            # there can be only one originSite in one block
            if originSite:
                if origin != originSite:
                    msg = "ERROR: not all files to be published have the same location\n"
                    msg += "file %s has origin %s, while previous files have %s\n" %\
                        (outFile['LFN'], origin, originSite)
                    raise CrabException(msg)
            else:
                originSite = origin
            fileDic['inevents'] = outFile['TotalEvents']
            toPublish[outdataset].append(fileDic)   # add dictionary to files list

    # all done
    common.logger.info("parsing of crab_fjr files completed")
    argsForDbs3 = { \
        'sourceApi': sourceApi,
        'globalApi': globalApi,
        'inputDataset': inputDataset,
        'toPublish': toPublish,
        'destApi': destinationApi,
        'destReadApi': destinationReadApi,
        'migrateApi': migrateApi,
        'origin_site_name': originSite}
    return argsForDbs3
def parseFinalReport(self, input):
    """
    Parses the FJR produced by the job in order to retrieve
    the WrapperExitCode and the ExeExitCode.
    Updates the BossDB with these values.
    """
    from ProdCommon.FwkJobRep.ReportParser import readJobReport

    codeValue = {}

    jreports = readJobReport(input)
    if len(jreports) <= 0:
        codeValue["applicationReturnCode"] = str(50115)
        codeValue["wrapperReturnCode"] = str(50115)
        common.logger.debug(
            "Empty FrameworkJobReport: error code assigned is 50115 ")
        return codeValue

    jobReport = jreports[0]
    exit_status = ''

    ##### temporary fix for incomplete FJR ####
    fjr = open(input)
    len_fjr = len(fjr.readlines())
    if (len_fjr <= 6):
        ### 50115 - cmsRun did not produce a valid/readable job report at runtime
        codeValue["applicationReturnCode"] = str(50115)
        codeValue["wrapperReturnCode"] = str(50115)

    if len(jobReport.errors) != 0:
        for error in jobReport.errors:
            if error['Type'] == 'WrapperExitCode':
                codeValue["wrapperReturnCode"] = error['ExitStatus']
            elif error['Type'] == 'ExeExitCode':
                codeValue["applicationReturnCode"] = error['ExitStatus']
            if error['Type'] == 'CMSException':
                codeValue["applicationReturnCodeOrig"] = error['ExitStatus']
            else:
                continue

    if not codeValue.has_key('wrapperReturnCode'):
        codeValue["wrapperReturnCode"] = ''
    if not codeValue.has_key('applicationReturnCode'):
        if codeValue.has_key('applicationReturnCodeOrig'):
            codeValue["applicationReturnCode"] = \
                codeValue["applicationReturnCodeOrig"]
            codeValue.pop("applicationReturnCodeOrig")
        else:
            codeValue["applicationReturnCode"] = ''
    else:
        if codeValue.has_key('applicationReturnCodeOrig'):
            codeValue.pop("applicationReturnCodeOrig")

    #### Filling BOSS DB with SE name and LFN, for edm and not_edm files ####
    lfns = []
    pfns = []

    if (len(jobReport.files) != 0):
        for f in jobReport.files:
            if f['LFN']:
                lfns.append(f['LFN'])
            if f['PFN']:
                #### FEDE: use the correct endpoint for copyData (we modify the bossDB value and not the fjr)
                if common.scheduler.name().upper() not in [
                        'LSF', 'CAF', 'PBS', 'PBSV2'
                ] and codeValue["wrapperReturnCode"] == 60308:
                    pfns.append(os.path.dirname(f['SurlForGrid']) + '/')
                else:
                    pfns.append(os.path.dirname(f['PFN']) + '/')

    if (len(jobReport.analysisFiles) != 0):
        for aFile in jobReport.analysisFiles:
            if aFile['LFN']:
                lfns.append(aFile['LFN'])
            if aFile['PFN']:
                #### FEDE: use the correct endpoint for copyData (we modify the bossDB value and not the fjr)
                if common.scheduler.name().upper() not in [
                        'LSF', 'CAF', 'PBS', 'PBSV2'
                ] and codeValue["wrapperReturnCode"] == 60308:
                    pfns.append(os.path.dirname(aFile['SurlForGrid']) + '/')
                else:
                    pfns.append(os.path.dirname(aFile['PFN']) + '/')

    codeValue["storage"] = pfns
    codeValue["lfn"] = lfns
    return codeValue
def __call__(self, filesetToProcess):
    """
    The algorithm itself
    """
    # Get configuration
    initObj = WMInit()
    initObj.setLogging()
    initObj.setDatabaseConnection(os.getenv("DATABASE"),
                                  os.getenv('DIALECT'), os.getenv("DBSOCK"))

    myThread = threading.currentThread()

    daofactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)

    lastFileset = daofactory(classname="Fileset.ListFilesetByTask")
    lastWorkflow = daofactory(classname="Workflow.LoadFromTask")
    subsRun = daofactory(classname="Subscriptions.LoadFromFilesetWorkflow")
    successJob = daofactory(classname="Subscriptions.SucceededJobs")
    allJob = daofactory(classname="Subscriptions.Jobs")
    fileInFileset = daofactory(classname="Files.InFileset")

    # Get the start Run if asked
    startRun = (filesetToProcess.name).split(":")[3]

    logging.debug("the T0Feeder is processing %s" %
                  filesetToProcess.name)
    logging.debug("the fileset name %s" %
                  (filesetToProcess.name).split(":")[0])

    fileType = (filesetToProcess.name).split(":")[2]
    crabTask = filesetToProcess.name.split(":")[0]
    LASTIME = filesetToProcess.lastUpdate

    tries = 1
    while True:
        try:
            myRequester = JSONRequests(url="vocms52.cern.ch:8889")
            requestResult = myRequester.get("/tier0/runs")
        except:
            logging.debug("T0Reader call error...")
            if tries == self.maxRetries:
                return
            else:
                tries += 1
                continue
        logging.debug("T0ASTRunChain feeder queries done ...")
        now = time.time()
        break

    for listRun in requestResult[0]:
        if startRun != 'None' and int(listRun['run']) >= int(startRun):
            if listRun['status'] == 'CloseOutExport' or \
               listRun['status'] == 'Complete' or \
               listRun['status'] == 'CloseOutT1Skimming':

                crabWorkflow = lastWorkflow.execute(task=crabTask)
                crabFileset = lastFileset.execute(task=crabTask)
                crabrunFileset = Fileset(
                    name=crabFileset[0]["name"].split(':')[0].split('-Run')[0] +
                    '-Run' + str(listRun['run']) + ":" +
                    ":".join(crabFileset[0]['name'].split(':')[1:]))

                if crabrunFileset.exists() > 0:
                    crabrunFileset.load()
                    currSubs = subsRun.execute(crabrunFileset.id,
                                               crabWorkflow[0]['id'])

                    if currSubs:
                        listsuccessJob = successJob.execute(
                            subscription=currSubs['id'])
                        listallJob = allJob.execute(
                            subscription=currSubs['id'])

                        if len(listsuccessJob) == len(listallJob):
                            for currid in listsuccessJob:
                                currjob = Job(id=currid)
                                currjob.load()

                                logging.debug("Reading FJR %s" % currjob['fwjr_path'])
                                jobReport = readJobReport(currjob['fwjr_path'])

                                if len(jobReport) > 0:
                                    if jobReport[0].files:
                                        for newFile in jobReport[0].files:
                                            logging.debug("Output path %s" % newFile['LFN'])
                                            newFileToAdd = File(lfn=newFile['LFN'],
                                                                locations='caf.cern.ch')

                                            LOCK.acquire()
                                            if newFileToAdd.exists() == False:
                                                newFileToAdd.create()
                                            else:
                                                newFileToAdd.loadData()
                                            LOCK.release()

                                            listFile = fileInFileset.execute(filesetToProcess.id)
                                            if {'fileid': newFileToAdd['id']} not in listFile:
                                                filesetToProcess.addFile(newFileToAdd)
                                                filesetToProcess.setLastUpdate(now)
                                                filesetToProcess.commit()
                                                logging.debug("new file created/loaded and added by T0ASTRunChain...")

                                    elif jobReport[0].analysisFiles:
                                        for newFile in jobReport[0].analysisFiles:
                                            logging.debug("Output path %s " % newFile['LFN'])
                                            newFileToAdd = File(lfn=newFile['LFN'],
                                                                locations='caf.cern.ch')

                                            LOCK.acquire()
                                            if newFileToAdd.exists() == False:
                                                newFileToAdd.create()
                                            else:
                                                newFileToAdd.loadData()
                                            LOCK.release()

                                            listFile = fileInFileset.execute(filesetToProcess.id)
                                            if {'fileid': newFileToAdd['id']} not in listFile:
                                                logging.debug("%s loaded and added by T0ASTRunChain" % newFile['LFN'])
                                                filesetToProcess.addFile(newFileToAdd)
                                                filesetToProcess.setLastUpdate(now)
                                                filesetToProcess.commit()
                                                logging.debug("new file created/loaded and added by T0ASTRunChain...")

                        else:
                            break   # Missed fjr - Try next time

    # Commit the fileset
    logging.debug("Test purge in T0ASTRunChain ...")
    filesetToProcess.load()
    LASTIME = filesetToProcess.lastUpdate

    # For re-opened or empty filesets, try until the purge time
    if (int(now) / 3600 - LASTIME / 3600) > self.reopenTime:
        filesetToProcess.setLastUpdate(time.time())
        filesetToProcess.commit()

    if (int(now) / 3600 - LASTIME / 3600) > self.purgeTime:
        filesetToProcess.markOpen(False)
        logging.debug("Purge Done...")
def run(self):
    """
    The main method of the class: report status of a task
    """
    common.logger.debug("Reporter::run() called")

    task = common._db.getTask()

    msg = "--------------------\n"
    msg += "Dataset: %s\n" % str(task['dataset'])
    if self.cfg_params.has_key('USER.copy_data') and int(self.cfg_params['USER.copy_data']) == 1:
        msg += "Remote output :\n"
        ## TODO: SL should come from jobDB!
        from PhEDExDatasvcInfo import PhEDExDatasvcInfo

        stageout = PhEDExDatasvcInfo(self.cfg_params)
        endpoint, lfn, SE, SE_PATH, user = stageout.getEndpoint()
        #print endpoint, lfn, SE, SE_PATH, user

        msg += "SE: %s %s  srmPath: %s\n" % (self.cfg_params['USER.storage_element'], SE, endpoint)
    else:
        msg += "Local output: %s\n" % task['outputDirectory']
    #print task
    possible_status = ['Created', 'Undefined', 'Submitting', 'Submitted',
                       'NotSubmitted', 'Waiting', 'Ready', 'Scheduled',
                       'Running', 'Done', 'Killing', 'Killed', 'Aborted',
                       'Unknown', 'Done (Failed)', 'Cleared', 'Retrieved']
    eventsRead = 0
    eventsRequired = 0
    filesRead = 0
    filesRequired = 0
    lumis = []
    for job in task.getJobs():
        if (job.runningJob['applicationReturnCode'] != 0 or
            job.runningJob['wrapperReturnCode'] != 0):
            continue

        # get FJR filename
        fjr = self.fjrDirectory + job['outputFiles'][-1]
        jobReport = readJobReport(fjr)
        if len(jobReport) > 0:
            inputFiles = jobReport[0].inputFiles
            for inputFile in inputFiles:
                # accumulate the list of lumi sections run over
                for run in inputFile.runs.keys():
                    for lumi in inputFile.runs[run]:
                        lumis.append((run, lumi))
                filesRead += 1
                eventsRead += int(inputFile['EventsRead'])
            #print jobReport[0].inputFiles, '\n'
        else:
            pass
            #print 'no FJR available for job #%s' % job['jobId']
            #print "--------------------------"

    # Compact and write the list of successful lumis
    lumiList = LumiList(lumis=lumis)
    compactList = lumiList.getCompactList()
    lumiFilename = task['outputDirectory'] + 'lumiSummary.json'
    lumiSummary = open(lumiFilename, 'w')
    json.dump(compactList, lumiSummary)
    lumiSummary.write('\n')
    lumiSummary.close()

    msg += "Total Events read: %s\n" % eventsRead
    msg += "Total Files read: %s\n" % filesRead
    msg += "Total Jobs : %s\n" % len(task.getJobs())
    msg += "Luminosity section summary file: %s\n" % lumiFilename
    list_ID = {}

    # TEMPORARY by Fabio, to be removed
    # avoid clashes between glite_slc5 and glite schedulers when a server is used
    # otherwise, -report with a server requires a local scheduler
    if self.cfg_params.get('CRAB.server_name', None) is None:
        common.logger.debug("Reporter updating task status")
        task = common.scheduler.queryEverything(task['id'])

    for st in possible_status:
        list_ID = common._db.queryAttrRunJob({'statusScheduler': st}, 'jobId')
        if (len(list_ID) > 0):
            msg += "   # Jobs: %s:%s\n" % (str(st), len(list_ID))
        pass
    msg += "\n----------------------------\n"
    common.logger.info(msg)

    file = common.work_space.shareDir() + 'arguments.xml'
    #print "file = ", file

    ### starting from the arguments.xml file, build a json file containing
    ### the run:lumi that should be analyzed with the task
    inputRunLumiFileName = self.getInputRunLumi(file)

    ### missing lumis to analyze: starting from the lumimask or from the
    ### argument file, calculate the difference with report.json
    ### if a lumimask is used in the crab.cfg
    if (self.cfg_params.get('CMSSW.lumi_mask')):
        lumimask = self.cfg_params.get('CMSSW.lumi_mask')
        #print "lumimask = ", lumimask
        self.compareJsonFile(lumimask)
    ### without lumimask
    elif (inputRunLumiFileName):
        self.compareJsonFile(inputRunLumiFileName)
    else:
        common.logger.info("No json file to compare")
    return
def run(self):
    """
    The main method of the class: report status of a task
    """
    common.logger.debug("Reporter::run() called")

    task = common._db.getTask()

    msg = "--------------------\n"
    msg += "Dataset: %s\n" % str(task['dataset'])
    if self.cfg_params.has_key('USER.copy_data') and int(self.cfg_params['USER.copy_data']) == 1:
        msg += "Remote output :\n"
        ## TODO: SL should come from jobDB!
        from PhEDExDatasvcInfo import PhEDExDatasvcInfo

        stageout = PhEDExDatasvcInfo(self.cfg_params)
        endpoint, PNN, lfn, SE, SE_PATH, user = stageout.getEndpoint()
        #print endpoint, lfn, SE, SE_PATH, user

        msg += "SE: %s %s  srmPath: %s\n" % (self.cfg_params['USER.storage_element'], SE, endpoint)
    else:
        msg += "Local output: %s\n" % task['outputDirectory']
    #print task
    possible_status = ['Created', 'Undefined', 'Submitting', 'Submitted',
                       'NotSubmitted', 'Waiting', 'Ready', 'Scheduled',
                       'Running', 'Done', 'Killing', 'Killed', 'Aborted',
                       'Unknown', 'Done (Failed)', 'Cleared', 'Retrieved']
    eventsRead = 0
    eventsRequired = 0
    filesRead = 0
    filesRequired = 0
    lumis = []
    for job in task.getJobs():
        if (job.runningJob['applicationReturnCode'] != 0 or
            job.runningJob['wrapperReturnCode'] != 0):
            continue

        # get FJR filename
        fjr = self.fjrDirectory + job['outputFiles'][-1]
        jobReport = readJobReport(fjr)
        if len(jobReport) > 0:
            inputFiles = jobReport[0].inputFiles
            for inputFile in inputFiles:
                # accumulate the list of lumi sections run over
                for run in inputFile.runs.keys():
                    for lumi in inputFile.runs[run]:
                        lumis.append((run, lumi))
                filesRead += 1
                eventsRead += int(inputFile['EventsRead'])
            #print jobReport[0].inputFiles, '\n'
        else:
            pass
            #print 'no FJR available for job #%s' % job['jobId']
            #print "--------------------------"

    # Compact and write the list of successful lumis
    lumiList = LumiList(lumis=lumis)
    compactList = lumiList.getCompactList()
    lumiFilename = task['outputDirectory'] + 'lumiSummary.json'
    lumiSummary = open(lumiFilename, 'w')
    json.dump(compactList, lumiSummary)
    lumiSummary.write('\n')
    lumiSummary.close()

    msg += "Total Events read: %s\n" % eventsRead
    msg += "Total Files read: %s\n" % filesRead
    msg += "Total Jobs : %s\n" % len(task.getJobs())
    msg += "Luminosity section summary file: %s\n" % lumiFilename
    list_ID = {}

    for st in possible_status:
        list_ID = common._db.queryAttrRunJob({'statusScheduler': st}, 'jobId')
        if (len(list_ID) > 0):
            msg += "   # Jobs: %s:%s\n" % (str(st), len(list_ID))
        pass
    msg += "\n----------------------------\n"
    common.logger.info(msg)

    file = common.work_space.shareDir() + 'arguments.xml'

    ### starting from the arguments.xml file, build a json file containing
    ### the run:lumi that should be analyzed with the task
    inputRunLumiFileName = self.getInputRunLumi(file)

    ### missing lumis to analyze: starting from the argument file,
    ### calculate the difference with report.json
    if not inputRunLumiFileName or not os.path.isfile(inputRunLumiFileName) \
       or os.path.getsize(inputRunLumiFileName) == 0:
        common.logger.info("No Lumi file to compare")
    else:
        self.compareJsonFile(inputRunLumiFileName, 'task_missingLumiSummary.json')
    return
def publishDataset(self, file):
    """
    _publishDataset_

    Publish in the local DBS the datasets described in the framework job
    report file.
    """
    try:
        jobReport = readJobReport(file)[0]
        self.exit_status = '0'
    except IndexError:
        self.exit_status = '1'
        msg = "Error: Problem with " + file + " file"
        common.logger.info(msg)
        return self.exit_status

    if len(self.dataset_to_import) != 0:
        for dataset in self.dataset_to_import:
            common.logger.info("--->>> Importing parent dataset in the dbs: " + dataset)
            status_import = self.importParentDataset(self.globalDBS, dataset)
            if status_import == 1:
                common.logger.info('Problem with parent ' + dataset +
                                   ' import from the global DBS ' + self.globalDBS +
                                   ' to the local one ' + self.DBSURL)
                self.exit_status = '1'
                return self.exit_status
            else:
                common.logger.info('Import ok of dataset ' + dataset)

    if len(jobReport.files) <= 0:
        self.exit_status = '1'
        msg = "Error: No EDM file to publish in the xml file " + file
        common.logger.info(msg)
        return self.exit_status
    else:
        msg = "fjr contains some files to publish"
        common.logger.debug(msg)

    #### dataset creation in the DBS
    #// DBS to contact: write and read on the same DBS
    dbsReader = DBSReader(self.DBSURL, level='ERROR')
    dbswriter = DBSWriter(self.DBSURL)

    self.published_datasets = []
    for fileinfo in jobReport.files:
        datasets_info = fileinfo.dataset
        if len(datasets_info) <= 0:
            self.exit_status = '1'
            msg = "Error: No info about dataset in the xml file " + file
            common.logger.info(msg)
            return self.exit_status
        else:
            for dataset in datasets_info:
                #### for production data
                self.processedData = dataset['ProcessedDataset']
                if dataset['PrimaryDataset'] == 'null':
                    dataset['PrimaryDataset'] = self.userprocessedData
                elif self.datasetpath.upper() != 'NONE':
                    dataset['ParentDataset'] = self.datasetpath

                dataset['PSetContent'] = self.content
                # TODO: add the real name of the user cfg
                cfgMeta = {'name': self.pset, 'Type': 'user',
                           'annotation': 'user cfg',
                           'version': 'private version'}
                common.logger.info("PrimaryDataset = %s" % dataset['PrimaryDataset'])
                common.logger.info("ProcessedDataset = %s" % dataset['ProcessedDataset'])
                common.logger.info("<User Dataset Name> = /" + dataset['PrimaryDataset'] +
                                   "/" + dataset['ProcessedDataset'] + "/USER")

                self.dataset_to_check = "/" + dataset['PrimaryDataset'] + \
                                        "/" + dataset['ProcessedDataset'] + "/USER"
                self.published_datasets.append(self.dataset_to_check)

                common.logger.log(10 - 1, "--->>> Inserting primary: %s processed : %s" %
                                  (dataset['PrimaryDataset'], dataset['ProcessedDataset']))

                #### check if the dataset already exists in the DBS
                result = dbsReader.matchProcessedDatasets(dataset['PrimaryDataset'],
                                                          'USER',
                                                          dataset['ProcessedDataset'])
                if len(result) != 0:
                    result = dbsReader.listDatasetFiles(self.dataset_to_check)

                primary = DBSWriterObjects.createPrimaryDataset(dataset, dbswriter.dbs)
                common.logger.log(10 - 1, "Primary: %s " % primary)

                algo = DBSWriterObjects.createAlgorithm(dataset, cfgMeta, dbswriter.dbs)
                common.logger.log(10 - 1, "Algo: %s " % algo)

                processed = DBSWriterObjects.createProcessedDataset(primary, algo,
                                                                    dataset, dbswriter.dbs)
                common.logger.log(10 - 1, "Processed: %s " % processed)

                common.logger.log(10 - 1, "Inserted primary %s processed %s" %
                                  (primary, processed))

    common.logger.log(10 - 1, "exit_status = %s " % self.exit_status)
    return self.exit_status
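# publishDataset() builds the "/Primary/Processed/USER" dataset path by hand
# in several places. A hypothetical helper (not part of the original code)
# capturing that convention, including the fallback applied when production
# data carry 'null' as their primary dataset:
def user_dataset_path(dataset, fallback_primary):
    # production data carry 'null' as PrimaryDataset; substitute the
    # user's processed-data name in that case, as the code above does
    primary = dataset.get('PrimaryDataset')
    if primary in (None, 'null'):
        primary = fallback_primary
    return '/%s/%s/USER' % (primary, dataset['ProcessedDataset'])

# e.g. user_dataset_path({'PrimaryDataset': 'null',
#                         'ProcessedDataset': 'myProc'}, 'myPrimary')
# -> '/myPrimary/myProc/USER'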
def ExtractPerformanceInformation(archive, ExtractDir, debug):
    """
    Open archive in ExtractDir and extract performance information
    from the FrameworkJobReport.xml it contains.
    """
    # extract the FrameworkJobReport.xml
    if os.path.isfile(archive) and tarfile.is_tarfile(archive):
        jobtarfile = tarfile.open(archive, 'r:gz')
        reportFileList = [tf for tf in jobtarfile.getnames()
                          if tf.count("FrameworkJobReport.xml")]
        if len(reportFileList) > 0:
            reportFile = reportFileList[0]
            results = []
            result = {}
            jobtarfile.extract(reportFile, ExtractDir)
            reportfilename = os.path.join(ExtractDir, reportFile)
            if os.path.isfile(reportfilename):
                if debug:
                    print 'Opening FrameworkJobReport:', reportfilename, 'from archive:', archive
                reports = readJobReport(reportfilename)
                for report in reports:
                    performanceReport = report.performance
                    # average physical memory
                    try:
                        mem = performanceReport.summaries['RSSMemory']['AvgRSSMemory']
                        result['AverageMemory'] = float(mem)
                    except:
                        print 'Cannot read RSSMemory:AvgRSSMemory from FrameworkJobReport.xml from', archive
                        result['AverageMemory'] = 0.
                    # maximum physical memory
                    try:
                        mem = performanceReport.summaries['RSSMemory']['MaxRSSMemory']
                        result['MaxMemory'] = float(mem)
                    except:
                        print 'Cannot read RSSMemory:MaxRSSMemory from FrameworkJobReport.xml from', archive
                        result['MaxMemory'] = 0.
                    # events per job from the trigger report, falling back to
                    # the sum of events read from the input files
                    try:
                        events = performanceReport.summaries['TrigReport']['TotalEvents']
                        result['Events'] = int(events)
                    except:
                        try:
                            events = 0
                            for inputFile in report.inputFiles:
                                events += int(inputFile['EventsRead'])
                            result['Events'] = events
                        except:
                            print 'Cannot read TrigReport:TotalEvents from FrameworkJobReport.xml from', archive
                            result['Events'] = 0
                    # time per event, falling back to the job's wall-clock
                    # time divided by the number of events
                    try:
                        time = performanceReport.summaries['Timing']['RealPerEvent']
                        result['TimePerEvent'] = float(time)
                    except:
                        try:
                            timePerJob = float(report.timing['AppEndTime']) - \
                                         float(report.timing['AppStartTime'])
                            result['TimePerEvent'] = timePerJob / float(result['Events'])
                        except:
                            print 'Cannot read Timing:RealPerEvent from FrameworkJobReport.xml from', archive
                            result['TimePerEvent'] = 0
                    results.append(result)
                    result = {}
            return results
        else:
            print 'Archive:', archive, 'does not contain a FrameworkJobReport.xml, skipping!'
            return None
    else:
        print 'Archive:', archive, 'is not a gzipped tarball, skipping!'
        return None
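# ExtractPerformanceInformation() expects the FWJR performance summaries to
# carry the section/key names below; this is a minimal self-contained sketch
# of the same try/fall-back-to-default pattern against a plain dict. The key
# names are the ones the code above reads; the values are made up.
summaries = {
    'RSSMemory': {'AvgRSSMemory': '950.2', 'MaxRSSMemory': '1210.7'},
    'TrigReport': {'TotalEvents': '5000'},
    'Timing': {'RealPerEvent': '0.42'},
}

def summary_value(section, key, cast, default):
    # mirror the "read, cast, else default" pattern used above
    try:
        return cast(summaries[section][key])
    except (KeyError, TypeError, ValueError):
        return default

result = {
    'AverageMemory': summary_value('RSSMemory', 'AvgRSSMemory', float, 0.),
    'MaxMemory': summary_value('RSSMemory', 'MaxRSSMemory', float, 0.),
    'Events': summary_value('TrigReport', 'TotalEvents', int, 0),
    'TimePerEvent': summary_value('Timing', 'RealPerEvent', float, 0.),
}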
def __call__(self, filesetToProcess):
    """
    The algorithm itself
    """
    # Get configuration
    initObj = WMInit()
    initObj.setLogging()
    initObj.setDatabaseConnection(os.getenv("DATABASE"),
                                  os.getenv('DIALECT'), os.getenv("DBSOCK"))

    myThread = threading.currentThread()
    daofactory = DAOFactory(package="WMCore.WMBS",
                            logger=myThread.logger,
                            dbinterface=myThread.dbi)

    lastFileset = daofactory(classname="Fileset.ListFilesetByTask")
    lastWorkflow = daofactory(classname="Workflow.LoadFromTask")
    subsRun = daofactory(classname="Subscriptions.LoadFromFilesetWorkflow")
    successJob = daofactory(classname="Subscriptions.SucceededJobs")
    allJob = daofactory(classname="Subscriptions.Jobs")
    fileInFileset = daofactory(classname="Files.InFileset")

    # Get the start run if asked
    startRun = filesetToProcess.name.split(":")[3]
    logging.debug("the T0Feeder is processing %s" % filesetToProcess.name)
    logging.debug("the fileset name %s" % filesetToProcess.name.split(":")[0])

    fileType = filesetToProcess.name.split(":")[2]
    crabTask = filesetToProcess.name.split(":")[0]
    LASTIME = filesetToProcess.lastUpdate

    # Query the Tier-0 data service for the run list, retrying on failure
    tries = 1
    while True:
        try:
            myRequester = JSONRequests(url="vocms52.cern.ch:8889")
            requestResult = myRequester.get("/tier0/runs")
        except:
            logging.debug("T0Reader call error...")
            if tries == self.maxRetries:
                return
            else:
                tries += 1
                continue
        logging.debug("T0ASTRunChain feeder queries done ...")
        now = time.time()
        break

    for listRun in requestResult[0]:
        if startRun != 'None' and int(listRun['run']) >= int(startRun):
            if listRun['status'] in ('CloseOutExport', 'Complete',
                                     'CloseOutT1Skimming'):
                crabWorkflow = lastWorkflow.execute(task=crabTask)
                crabFileset = lastFileset.execute(task=crabTask)
                crabrunFileset = Fileset(
                    name=crabFileset[0]["name"].split(':')[0].split('-Run')[0] +
                         '-Run' + str(listRun['run']) + ":" +
                         ":".join(crabFileset[0]['name'].split(':')[1:]))

                if crabrunFileset.exists() > 0:
                    crabrunFileset.load()
                    currSubs = subsRun.execute(crabrunFileset.id,
                                               crabWorkflow[0]['id'])
                    if currSubs:
                        listsuccessJob = successJob.execute(subscription=currSubs['id'])
                        listallJob = allJob.execute(subscription=currSubs['id'])
                        if len(listsuccessJob) == len(listallJob):
                            for currid in listsuccessJob:
                                currjob = Job(id=currid)
                                currjob.load()
                                logging.debug("Reading FJR %s" % currjob['fwjr_path'])
                                jobReport = readJobReport(currjob['fwjr_path'])
                                if len(jobReport) > 0:
                                    if jobReport[0].files:
                                        for newFile in jobReport[0].files:
                                            logging.debug("Output path %s" % newFile['LFN'])
                                            newFileToAdd = File(lfn=newFile['LFN'],
                                                                locations='caf.cern.ch')
                                            LOCK.acquire()
                                            if not newFileToAdd.exists():
                                                newFileToAdd.create()
                                            else:
                                                newFileToAdd.loadData()
                                            LOCK.release()
                                            listFile = fileInFileset.execute(filesetToProcess.id)
                                            if {'fileid': newFileToAdd['id']} not in listFile:
                                                filesetToProcess.addFile(newFileToAdd)
                                                filesetToProcess.setLastUpdate(now)
                                                filesetToProcess.commit()
                                                logging.debug("new file created/loaded and added by T0ASTRunChain...")
                                    elif jobReport[0].analysisFiles:
                                        for newFile in jobReport[0].analysisFiles:
                                            logging.debug("Output path %s " % newFile['LFN'])
                                            newFileToAdd = File(lfn=newFile['LFN'],
                                                                locations='caf.cern.ch')
                                            LOCK.acquire()
                                            if not newFileToAdd.exists():
                                                newFileToAdd.create()
                                            else:
                                                newFileToAdd.loadData()
                                            LOCK.release()
                                            listFile = fileInFileset.execute(filesetToProcess.id)
                                            if {'fileid': newFileToAdd['id']} not in listFile:
                                                logging.debug("%s loaded and added by T0ASTRunChain" % newFile['LFN'])
                                                filesetToProcess.addFile(newFileToAdd)
                                                filesetToProcess.setLastUpdate(now)
                                                filesetToProcess.commit()
                                                logging.debug("new file created/loaded and added by T0ASTRunChain...")
                                else:
                                    break  # Missed FJR - try again next time

    # Commit the fileset
    logging.debug("Test purge in T0ASTRunChain ...")
    filesetToProcess.load()
    LASTIME = filesetToProcess.lastUpdate

    # For a re-opened or empty fileset, keep trying until the purge time
    if (int(now) / 3600 - LASTIME / 3600) > self.reopenTime:
        filesetToProcess.setLastUpdate(time.time())
        filesetToProcess.commit()

    if (int(now) / 3600 - LASTIME / 3600) > self.purgeTime:
        filesetToProcess.markOpen(False)
        logging.debug("Purge Done...")
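# The purge logic at the end of __call__ compares whole hours elapsed since
# the fileset's lastUpdate against self.reopenTime and self.purgeTime. A
# self-contained sketch of that decision; the threshold values below are
# hypothetical, not taken from the original configuration.
import time

def hours_since(last_update, now=None):
    # integer-hour difference, as in the (now / 3600 - LASTIME / 3600) test
    now = int(now if now is not None else time.time())
    return now // 3600 - int(last_update) // 3600

reopenTime, purgeTime = 4, 48        # hours; made-up thresholds
idle = hours_since(1700000000, now=1700000000 + 6 * 3600)   # -> 6
refresh_needed = idle > reopenTime   # True: touch lastUpdate and commit
purge_needed = idle > purgeTime      # False: the fileset stays open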