def handleException(exitAcronym, exitCode, exitMsg):
    """Record a failed job in jobReport.json and print diagnostics.

    Loads the existing jobReport.json (when present), appends the type and
    details of every cmsRun error found there to ``exitMsg``, and prefers the
    exit code stored in the FJR when the caller's code equals it modulo 256
    but is not identical to it. The report is then written back to
    jobReport.json with the exit information and, when it can be read, the
    execution site name.

    Args:
        exitAcronym: value stored under the report's 'exitAcronym' key.
        exitCode: exit code to store; may be replaced by the FJR one.
        exitMsg: message to store; CMSSW error text is appended to it.
    """
    # First save the traceback before it gets overwritten by other
    # tracebacks (e.g.: wrong jobReport).
    formatted_tb = traceback.format_exc()

    report = {}
    try:
        if os.path.exists("jobReport.json"):
            # Context manager: the handle is closed even when parsing fails.
            with open("jobReport.json") as reportFile:
                report = json.load(reportFile)
        else:
            print("WARNING: WMCore did not produce a jobReport.json; FJR will not be useful.")
    except Exception:
        # Best effort: an unreadable report must not stop the failure handling.
        print("WARNING: Unable to parse WMCore's jobReport.json; FJR will not be useful.\n", traceback.format_exc())

    cmsRunErrors = report.get('steps', {}).get('cmsRun', {}).get('errors')
    if cmsRunErrors:
        exitMsg += '\nCMSSW error message follows.\n'
        for error in cmsRunErrors:
            if 'exitCode' in error:
                try:
                    exitCode = int(exitCode)
                    fjrExitCode = int(error['exitCode'])
                    # Same value modulo 256 but not identical: keep the
                    # full code found in the FJR.
                    if (fjrExitCode % 256 == exitCode) and (fjrExitCode != exitCode):
                        print("NOTE: FJR has exit code %d and WMCore reports %d; preferring the FJR one." % (fjrExitCode, exitCode))
                        exitCode = fjrExitCode
                except (ValueError, TypeError):
                    # Non-numeric or None exit codes cannot be compared;
                    # keep the current exitCode instead of crashing here.
                    pass
            exitMsg += error['type'] + '\n'
            exitMsg += error['details'] + '\n'

    report['exitAcronym'] = exitAcronym
    report['exitCode'] = exitCode
    report['exitMsg'] = exitMsg
    print("ERROR: Exceptional exit at %s (%s): %s" % (time.asctime(time.gmtime()), str(exitCode), str(exitMsg)))
    # format_exc() yields a string starting with "None" when no exception
    # was being handled at the time of the call.
    if not formatted_tb.startswith("None"):
        print("ERROR: Traceback follows:\n", formatted_tb)

    try:
        slc = SiteLocalConfig.loadSiteLocalConfig()
        report['executed_site'] = slc.siteName
        print("== Execution site for failed job from site-local-config.xml: %s" % slc.siteName)
    except Exception:
        # The site name is optional information; still write the report.
        print("ERROR: Failed to record execution site name in the FJR from the site-local-config.xml")
        print(traceback.format_exc())

    with open('jobReport.json', 'w') as of:
        json.dump(report, of)
    # `ad` is a module-level name defined outside this function.
    if ad and "CRAB3_RUNTIME_DEBUG" not in os.environ:
        stopDashboardMonitoring(ad)
def handleException(exitAcronym, exitCode, exitMsg):
    """Write the failure information of a job into jobReport.json.

    The existing jobReport.json is loaded when it exists. For every cmsRun
    error listed in it, the error type and details are appended to
    ``exitMsg``; when the FJR exit code differs from ``exitCode`` but matches
    it modulo 256, the FJR one is used. The updated report (exit acronym,
    code, message and, if readable, the execution site) is written back to
    jobReport.json and diagnostics are printed along the way.

    Args:
        exitAcronym: value stored under the report's 'exitAcronym' key.
        exitCode: exit code to store; may be replaced by the FJR one.
        exitMsg: message to store; CMSSW error text is appended to it.
    """
    # First save the traceback before it gets overwritten by other
    # tracebacks (e.g.: wrong jobReport).
    formatted_tb = traceback.format_exc()

    report = {}
    try:
        if os.path.exists("jobReport.json"):
            # Use a context manager so the file handle is always released.
            with open("jobReport.json") as reportFile:
                report = json.load(reportFile)
        else:
            print("WARNING: WMCore did not produce a jobReport.json; FJR will not be useful.")
    except Exception:
        # Best effort: carry on with an empty report when it is unreadable.
        print("WARNING: Unable to parse WMCore's jobReport.json; FJR will not be useful.\n", traceback.format_exc())

    cmsRunErrors = report.get('steps', {}).get('cmsRun', {}).get('errors')
    if cmsRunErrors:
        exitMsg += '\nCMSSW error message follows.\n'
        for error in cmsRunErrors:
            if 'exitCode' in error:
                try:
                    exitCode = int(exitCode)
                    fjrExitCode = int(error['exitCode'])
                    # Equal modulo 256 yet different: the FJR value wins.
                    if (fjrExitCode % 256 == exitCode) and (fjrExitCode != exitCode):
                        print("NOTE: FJR has exit code %d and WMCore reports %d; preferring the FJR one." % (fjrExitCode, exitCode))
                        exitCode = fjrExitCode
                except (ValueError, TypeError):
                    # Exit codes that are not convertible to int (including
                    # None) are left untouched rather than aborting.
                    pass
            exitMsg += error['type'] + '\n'
            exitMsg += error['details'] + '\n'

    report['exitAcronym'] = exitAcronym
    report['exitCode'] = exitCode
    report['exitMsg'] = exitMsg
    print("ERROR: Exceptional exit at %s (%s): %s" % (time.asctime(time.gmtime()), str(exitCode), str(exitMsg)))
    # Only print the saved traceback when an exception was actually active.
    if not formatted_tb.startswith("None"):
        print("ERROR: Traceback follows:\n", formatted_tb)

    try:
        slc = SiteLocalConfig.loadSiteLocalConfig()
        report['executed_site'] = slc.siteName
        print("== Execution site for failed job from site-local-config.xml: %s" % slc.siteName)
    except Exception:
        # Missing site information must not prevent writing the report.
        print("ERROR: Failed to record execution site name in the FJR from the site-local-config.xml")
        print(traceback.format_exc())

    with open('jobReport.json', 'w') as of:
        json.dump(report, of)
    # `ad` is a module-level name defined outside this function.
    if ad and "CRAB3_RUNTIME_DEBUG" not in os.environ:
        stopDashboardMonitoring(ad)
handleException("FAILED", EC_PsetHash, "Unable to compute pset hash for job output") mintime() sys.exit(EC_PsetHash) if jobExitCode: #TODO check exitcode from fwjr report['exitAcronym'] = "FAILED" report['exitCode'] = jobExitCode report['exitMsg'] = "Error while running CMSSW:\n" for error in report['steps']['cmsRun']['errors']: report['exitMsg'] += error['type'] + '\n' report['exitMsg'] += error['details'] + '\n' else: report['exitAcronym'] = "OK" report['exitCode'] = 0 report['exitMsg'] = "OK" slc = SiteLocalConfig.loadSiteLocalConfig() report['executed_site'] = slc.siteName if 'phedex-node' in slc.localStageOut: report['phedex_node'] = slc.localStageOut['phedex-node'] print("== Execution site from site-local-config.xml: %s" % slc.siteName) with open('jobReport.json', 'w') as of: json.dump(report, of) with open('jobReportExtract.pickle', 'w') as of: pickle.dump(report, of) if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ: stopDashboardMonitoring(ad) print("==== Report file creation FINISHING at %s ====" % time.asctime(time.gmtime())) except FwkJobReportException as FJRex: msg = "BadFWJRXML" handleException("FAILED", EC_ReportHandlingErr, msg) mintime()
handleException("FAILED", EC_PsetHash, exmsg) mintime() sys.exit(EC_PsetHash) if jobExitCode: #TODO check exitcode from fwjr rep['exitAcronym'] = "FAILED" rep['exitCode'] = jobExitCode rep['exitMsg'] = "Error while running CMSSW:\n" for error in rep['steps']['cmsRun']['errors']: rep['exitMsg'] += error['type'] + '\n' rep['exitMsg'] += error['details'] + '\n' else: rep['exitAcronym'] = "OK" rep['exitCode'] = 0 rep['exitMsg'] = "OK" slCfg = SiteLocalConfig.loadSiteLocalConfig() rep['executed_site'] = slCfg.siteName if 'phedex-node' in slCfg.localStageOut: rep['phedex_node'] = slCfg.localStageOut['phedex-node'] print("== Execution site from site-local-config.xml: %s" % slCfg.siteName) with open('jobReport.json', 'w') as of: json.dump(rep, of) with open('jobReportExtract.pickle', 'w') as of: pickle.dump(rep, of) if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ: stopDashboardMonitoring(ad) print("==== Report file creation FINISHED at %s ====" % time.asctime(time.gmtime())) except FwkJobReportException as FJRex: extype = "BadFWJRXML"
"Unable to compute pset hash for job output") mintime() sys.exit(EC_PsetHash) if jobExitCode: #TODO check exitcode from fwjr report['exitAcronym'] = "FAILED" report['exitCode'] = jobExitCode report['exitMsg'] = "Error while running CMSSW:\n" for error in report['steps']['cmsRun']['errors']: report['exitMsg'] += error['type'] + '\n' report['exitMsg'] += error['details'] + '\n' else: report['exitAcronym'] = "OK" report['exitCode'] = 0 report['exitMsg'] = "OK" slc = SiteLocalConfig.loadSiteLocalConfig() report['executed_site'] = slc.siteName if 'phedex-node' in slc.localStageOut: report['phedex_node'] = slc.localStageOut['phedex-node'] print("== Execution site from site-local-config.xml: %s" % slc.siteName) with open('jobReport.json', 'w') as of: json.dump(report, of) with open('jobReportExtract.pickle', 'w') as of: pickle.dump(report, of) if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ: stopDashboardMonitoring(ad) print("==== Report file creation FINISHING at %s ====" % time.asctime(time.gmtime())) except FwkJobReportException as FJRex: msg = "BadFWJRXML"
def handleException(exitAcronym, exitCode, exitMsg):
    """Record a failed job in jobReport.json and print diagnostics.

    Loads the existing jobReport.json (when present), appends the type and
    details of every cmsRun error found there to ``exitMsg``, and prefers the
    exit code stored in the FJR when the caller's code equals it modulo 256
    but is not identical to it. The message is capped at 10k characters
    before being stored. The report is then written back to jobReport.json
    with the exit information and, when it can be read, the execution site.

    Args:
        exitAcronym: value stored under the report's 'exitAcronym' key.
        exitCode: exit code to store; may be replaced by the FJR one.
        exitMsg: message to store; CMSSW error text is appended to it.
    """
    # First save the traceback before it gets overwritten by other
    # tracebacks (e.g.: wrong jobReport).
    formatted_tb = traceback.format_exc()

    report = {}
    try:
        if os.path.exists("jobReport.json"):
            # Context manager: the handle is closed even when parsing fails.
            with open("jobReport.json") as reportFile:
                report = json.load(reportFile)
        else:
            print("WARNING: WMCore did not produce a jobReport.json; FJR will not be useful.")
    except Exception:
        # Best effort: an unreadable report must not stop the failure handling.
        print("WARNING: Unable to parse WMCore's jobReport.json; FJR will not be useful.\n", traceback.format_exc())

    cmsRunErrors = report.get('steps', {}).get('cmsRun', {}).get('errors')
    if cmsRunErrors:
        exitMsg += '\nCMSSW error message follows.\n'
        for err in cmsRunErrors:
            if 'exitCode' in err:
                try:
                    exitCode = int(exitCode)
                    fjrExitCode = int(err['exitCode'])
                    # Same value modulo 256 but not identical: keep the
                    # full code found in the FJR.
                    if (fjrExitCode % 256 == exitCode) and (fjrExitCode != exitCode):
                        print("NOTE: FJR has exit code %d and WMCore reports %d; preferring the FJR one." % (fjrExitCode, exitCode))
                        exitCode = fjrExitCode
                except (ValueError, TypeError):
                    # Non-numeric or None exit codes cannot be compared;
                    # keep the current exitCode instead of crashing here.
                    pass
            exitMsg += err['type'] + '\n'
            exitMsg += err['details'] + '\n'

    report['exitAcronym'] = exitAcronym
    report['exitCode'] = exitCode

    # Check size of message string passed by caller.
    maxChars = 10 * 1000
    if len(exitMsg) > maxChars:
        exitMsg = exitMsg[0:maxChars] + " + ... message truncated at 10k chars"
    report['exitMsg'] = exitMsg

    print("ERROR: Exceptional exit at %s (%s): %s" % (time.asctime(time.gmtime()), str(exitCode), str(exitMsg)))
    # format_exc() yields a string starting with "None" when no exception
    # was being handled at the time of the call.
    if not formatted_tb.startswith("None"):
        print("ERROR: Traceback follows:\n", formatted_tb)

    try:
        sLCfg = SiteLocalConfig.loadSiteLocalConfig()
        report['executed_site'] = sLCfg.siteName
        print("== Execution site for failed job from site-local-config.xml: %s" % sLCfg.siteName)
    except Exception:
        # The site name is optional information; still write the report.
        print("ERROR: Failed to record execution site name in the FJR from the site-local-config.xml")
        print(traceback.format_exc())

    with open('jobReport.json', 'w') as rf:
        json.dump(report, rf)
handleException("FAILED", EC_PsetHash, exmsg) mintime() sys.exit(EC_PsetHash) if jobExitCode: #TODO check exitcode from fwjr rep['exitAcronym'] = "FAILED" rep['exitCode'] = jobExitCode rep['exitMsg'] = "Error while running CMSSW:\n" for error in rep['steps']['cmsRun']['errors']: rep['exitMsg'] += error['type'] + '\n' rep['exitMsg'] += error['details'] + '\n' else: rep['exitAcronym'] = "OK" rep['exitCode'] = 0 rep['exitMsg'] = "OK" slCfg = SiteLocalConfig.loadSiteLocalConfig() rep['executed_site'] = slCfg.siteName if 'phedex-node' in slCfg.localStageOut: rep['phedex_node'] = slCfg.localStageOut['phedex-node'] print("== Execution site from site-local-config.xml: %s" % slCfg.siteName) with open('jobReport.json', 'w') as of: json.dump(rep, of) with open('jobReportExtract.pickle', 'w') as of: pickle.dump(rep, of) if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ: stopDashboardMonitoring(ad) print("==== Report file creation FINISHED at %s ====" % time.asctime(time.gmtime())) except FwkJobReportException as FJRex: extype = "BadFWJRXML" handleException("FAILED", EC_ReportHandlingErr, extype) mintime()