def testNoLocationFile(self):
    """
    _testNoLocationFile_

    Check how we deal with output files without a valid location
    """
    jobReport = Report()
    jobReport.load(self.noLocationReport)

    fileList = jobReport.getAllFiles()
    self.assertEqual(len(fileList), 2)
    self.assertItemsEqual(fileList[0]['locations'], {})
    self.assertEqual(fileList[0]['outputModule'], "RAWSIMoutput")
    self.assertItemsEqual(fileList[1]['locations'], {"T2_CH_CSCS"})
    self.assertEqual(fileList[1]['outputModule'], "logArchive")
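
# A standalone sketch of the same check outside of unittest: load a pickled
# job report and print each output file's module and locations. The helper
# name and report path are hypothetical; only Report.load()/getAllFiles() and
# the 'outputModule'/'locations' keys exercised by the test are assumed.
from WMCore.FwkJobReport.Report import Report

def summarizeLocations(reportPath='Report.0.pkl'):
    """Print the output module and locations of every file in a job report."""
    report = Report()
    report.load(reportPath)
    for outputFile in report.getAllFiles():
        print("%s -> %s" % (outputFile['outputModule'], sorted(outputFile['locations'])))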
def parse_fwk_report(data, config, report_filename):
    """Extract task data from a framework report.

    Analyze the CMSSW job framework report to get the CMSSW exit code,
    skipped files, runs and lumis processed on a file basis, total events
    written, and CPU time overall and per event.
    """
    exit_code = 0
    skipped = []
    infos = {}
    written = 0
    eventsPerRun = 0

    report = Report("cmsrun")
    report.parse(report_filename)

    exit_code = report.getExitCode()

    for fn in report.getAllSkippedFiles():
        fn = config['file map'].get(fn, fn)
        skipped.append(fn)

    outinfos = {}
    for file in report.getAllFiles():
        pfn = file['pfn']
        outinfos[pfn] = {
            'runs': {},
            'events': file['events'],
        }
        written += int(file['events'])
        for run in file['runs']:
            try:
                outinfos[pfn]['runs'][run.run].extend(run.lumis)
            except KeyError:
                outinfos[pfn]['runs'][run.run] = run.lumis

    for file in report.getAllInputFiles():
        filename = file['lfn'] if len(file['lfn']) > 0 else file['pfn']
        filename = config['file map'].get(filename, filename)
        file_lumis = []
        try:
            for run in file['runs']:
                for lumi in run.lumis:
                    file_lumis.append((run.run, lumi))
        except AttributeError:
            logger.info('Detected file-based task')
        infos[filename] = (int(file['events']), file_lumis)
        eventsPerRun += infos[filename][0]

    serialized = report.__to_json__(None)
    cputime = float(serialized['steps']['cmsrun']['performance']['cpu'].get('TotalJobCPU', '0'))

    data['files']['info'] = infos
    data['files']['output_info'] = outinfos
    data['files']['skipped'] = skipped
    data['events_written'] = written
    data['exe_exit_code'] = exit_code
    # For efficiency, we care only about the CPU time spent processing
    # events
    data['cpu_time'] = cputime
    data['events_per_run'] = eventsPerRun
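
# A minimal invocation sketch for parse_fwk_report(); the wrapper name and the
# report path are hypothetical. The function expects a module-level 'logger'
# (set up here) and a 'data' dict that already contains a 'files' dict; the
# 'file map' may be empty when no LFN-to-local-path remapping is needed.
import logging

logger = logging.getLogger(__name__)

def example_parse(report_filename='report.xml'):
    data = {'files': {}}
    config = {'file map': {}}
    parse_fwk_report(data, config, report_filename)
    print("exit code: %s, events written: %d"
          % (data['exe_exit_code'], data['events_written']))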
import json
import logging
import os
import subprocess
import sys
import threading
from pprint import pformat

from WMCore.Database.DBFormatter import DBFormatter
from WMCore.FwkJobReport.Report import Report
from WMCore.WMInit import connectToDB


def main():
    """
    _main_
    """
    if 'WMAGENT_CONFIG' not in os.environ:
        os.environ['WMAGENT_CONFIG'] = '/data/srv/wmagent/current/config/wmagent/config.py'
    if 'manage' not in os.environ:
        os.environ['manage'] = '/data/srv/wmagent/current/config/wmagent/manage'

    ### Fetch the report pickle files from the component log
    command = ["tail", "-n1000", "install/wmagent/JobAccountant/ComponentLog"]
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    logFiles = [line for line in out.splitlines() if 'install/wmagent/JobCreator/JobCache' in line]
    logFiles = [i.split()[2] for i in logFiles]
    msg = "Found %d pickle files to parse " % len(logFiles)

    ### Now unpickle each of these files and get their output files
    # also check whether any of them are duplicate
    lfn2PklDict = {}
    dupOutputPkl = {}  # string value with the dup LFN and keyed by the pickle file path
    jobReport = Report()
    for pklPath in logFiles:
        if not os.path.exists(pklPath):
            continue
        jobReport.load(pklPath)
        for e in jobReport.getAllFiles():
            lfn2PklDict.setdefault(e['lfn'], [])
            lfn2PklDict[e['lfn']].append(pklPath)

    # now check which files contain more than one pickle path (= created by diff jobs)
    dupFiles = []
    for lfn, pkls in lfn2PklDict.iteritems():
        if len(pkls) > 1:
            dupFiles.append(lfn)
            for pkl in pkls:
                if pkl not in dupOutputPkl:
                    jobReport.load(pkl)
                    dupOutputPkl[pkl] = jobReport.__to_json__(None)
                    dupOutputPkl[pkl]['dup_lfns'] = []
                dupOutputPkl[pkl]['dup_lfns'].append(lfn)

    msg += "with a total of %d output files and %d duplicated" % (len(lfn2PklDict), len(dupFiles))
    msg += " files to process among them."
    msg += "\nDuplicate files are:\n%s" % dupFiles
    print(msg)

    if dupFiles:
        print("See dupPickles.json for further details ...")
        with open('dupPickles.json', 'w') as fo:
            json.dump(dupOutputPkl, fo, indent=2)

    if dupFiles:
        var = raw_input("Can we automatically delete those pickle files? Y/N\n")
        if var == "Y":
            # then delete all job report files but the first one - NOT ideal
            for fname in dupFiles:
                for pklFile in lfn2PklDict[fname][1:]:
                    if os.path.isfile(pklFile):
                        print("Deleting %s ..." % pklFile)
                        os.remove(pklFile)
                    else:
                        print(" File has probably already been deleted %s ..." % pklFile)
            print(" Done!")

    ### Time to load all LFNs from the WMBS database - this is BAD
    print("\nNow loading all LFNs from wmbs_file_details ...")
    connectToDB()
    myThread = threading.currentThread()
    formatter = DBFormatter(logging, myThread.dbi)
    output = myThread.transaction.processData("SELECT lfn FROM wmbs_file_details")
    lfnsDB = formatter.format(output)
    lfnsDB = [item[0] for item in lfnsDB]
    print("Retrieved %d lfns from wmbs_file_details" % len(lfnsDB))

    ### Compare them to find the duplicates
    dupFiles = list(set(lfn2PklDict.keys()) & set(lfnsDB))
    print("\nFound %d duplicate files." % len(dupFiles))
    if len(dupFiles) == 0:
        sys.exit(0)

    ### Print some basic data about these reports
    print("Their overview is:")
    dbDupPkl = []
    for fname in dupFiles:
        for pklPath in lfn2PklDict[fname]:
            jobInfo = {'lfn': fname}
            jobInfo['pklPath'] = pklPath
            jobReport.load(pklPath)
            jobInfo['exitCode'] = jobReport.getExitCode()
            jobInfo['taskSuccess'] = jobReport.taskSuccessful()
            jobInfo['EOSLogURL'] = jobReport.getLogURL()
            jobInfo['HostName'] = jobReport.getWorkerNodeInfo()['HostName']
            jobInfo['Site'] = jobReport.getSiteName()
            jobInfo['task'] = jobReport.getTaskName()
            dbDupPkl.append(jobInfo)

    print(pformat(dbDupPkl))
    print("")
    print("Remove them, restart the component and be happy!\n")
    sys.exit(0)
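
# Conventional entry point for running the cleanup script standalone on the
# agent node (assumed; the original excerpt ends at main()):
if __name__ == "__main__":
    main()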