def handleException(exitAcronym, exitCode, exitMsg):
    """Record a failed job's exit information in ``jobReport.json``.

    Loads the existing ``jobReport.json`` from the current directory when
    there is one, appends the type and details of every cmsRun step error
    found in it to ``exitMsg``, stores the exit fields (and the execution
    site, when it can be loaded) in the report, and writes the report back.
    Finally stops the dashboard monitoring unless ``CRAB3_RUNTIME_DEBUG`` is
    set in the environment.

    :param exitAcronym: value stored under the report's ``'exitAcronym'`` key.
    :param exitCode: exit code supplied by the caller. It is replaced by an
        error's own exit code when that code differs from it but equals it
        modulo 256.
    :param exitMsg: message stored under ``'exitMsg'``, after the cmsRun
        error texts have been appended.
    """
    # First save the traceback before it gets overwritten by other
    # tracebacks (e.g. the one raised by a malformed jobReport).
    formatted_tb = traceback.format_exc()

    report = {}
    try:
        if os.path.exists("jobReport.json"):
            # Context manager: the handle is closed even if parsing fails.
            with open("jobReport.json") as reportFile:
                report = json.load(reportFile)
        else:
            print(
                "WARNING: WMCore did not produce a jobReport.json; FJR will not be useful."
            )
    except Exception:
        print(
            "WARNING: Unable to parse WMCore's jobReport.json; FJR will not be useful.\n",
            traceback.format_exc())
    if not isinstance(report, dict):
        # Valid JSON that is not an object cannot carry the exit fields;
        # start from an empty report instead of crashing further down.
        print(
            "WARNING: WMCore's jobReport.json is not a JSON object; FJR will not be useful."
        )
        report = {}

    # Tolerate 'steps' / 'cmsRun' being present but null.
    cmsRunStep = (report.get('steps') or {}).get('cmsRun') or {}
    cmsRunErrors = cmsRunStep.get('errors')
    if cmsRunErrors:
        exitMsg += '\nCMSSW error message follows.\n'
        for error in cmsRunErrors:
            if 'exitCode' in error:
                try:
                    exitCode = int(exitCode)
                    fjrExitCode = int(error['exitCode'])
                    # NOTE(review): presumably the caller's code is the FJR
                    # code truncated to 8 bits by the process exit status.
                    if (fjrExitCode % 256
                            == exitCode) and (fjrExitCode != exitCode):
                        print(
                            "NOTE: FJR has exit code %d and WMCore reports %d; preferring the FJR one."
                            % (fjrExitCode, exitCode))
                        exitCode = fjrExitCode
                except (TypeError, ValueError):
                    # Null or non-numeric code: keep what the caller gave us.
                    pass
            # A missing or null field must not abort the error handler.
            exitMsg += (error.get('type') or '') + '\n'
            exitMsg += (error.get('details') or '') + '\n'

    report['exitAcronym'] = exitAcronym
    report['exitCode'] = exitCode
    report['exitMsg'] = exitMsg
    print("ERROR: Exceptional exit at %s (%s): %s" %
          (time.asctime(time.gmtime()), str(exitCode), str(exitMsg)))
    # format_exc() yields a string starting with "None" when no exception
    # was being handled on entry.
    if not formatted_tb.startswith("None"):
        print("ERROR: Traceback follows:\n", formatted_tb)

    try:
        slc = SiteLocalConfig.loadSiteLocalConfig()
        report['executed_site'] = slc.siteName
        print(
            "== Execution site for failed job from site-local-config.xml: %s" %
            slc.siteName)
    except Exception:
        # Best effort: the report is still written without the site name.
        print(
            "ERROR: Failed to record execution site name in the FJR from the site-local-config.xml"
        )
        print(traceback.format_exc())

    with open('jobReport.json', 'w') as of:
        json.dump(report, of)
    if ad and "CRAB3_RUNTIME_DEBUG" not in os.environ:
        stopDashboardMonitoring(ad)
def handleException(exitAcronym, exitCode, exitMsg):
    """Write the exit information of a failed job into ``jobReport.json``.

    The existing ``jobReport.json`` (if any) is used as the starting point.
    Errors listed under its ``steps -> cmsRun -> errors`` entry are appended
    to ``exitMsg``, and an error's exit code replaces ``exitCode`` when the
    two differ but agree modulo 256. The execution site is added when the
    site-local config can be loaded. After the report is written, dashboard
    monitoring is stopped unless ``CRAB3_RUNTIME_DEBUG`` is in the
    environment.

    :param exitAcronym: stored under the ``'exitAcronym'`` key of the report.
    :param exitCode: exit code as seen by the caller (stored under ``'exitCode'``).
    :param exitMsg: human-readable message (stored under ``'exitMsg'``).
    """
    # First save the traceback before it gets overwritten by other tracebacks (e.g.: wrong jobReport)
    formatted_tb = traceback.format_exc()

    report = {}
    try:
        if os.path.exists("jobReport.json"):
            # Use a context manager so the file handle is not leaked.
            with open("jobReport.json") as fjrFile:
                report = json.load(fjrFile)
        else:
            print("WARNING: WMCore did not produce a jobReport.json; FJR will not be useful.")
    except Exception:
        print("WARNING: Unable to parse WMCore's jobReport.json; FJR will not be useful.\n", traceback.format_exc())
    if not isinstance(report, dict):
        # e.g. the file held a JSON list or null: nothing usable in it.
        print("WARNING: WMCore's jobReport.json is not a JSON object; FJR will not be useful.")
        report = {}

    # 'steps' or 'cmsRun' may be present but null; treat that like absent.
    stepErrors = ((report.get('steps') or {}).get('cmsRun') or {}).get('errors')
    if stepErrors:
        exitMsg += '\nCMSSW error message follows.\n'
        for error in stepErrors:
            if 'exitCode' in error:
                try:
                    exitCode = int(exitCode)
                    fjrExitCode = int(error['exitCode'])
                    if (fjrExitCode % 256 == exitCode) and (fjrExitCode != exitCode):
                        print("NOTE: FJR has exit code %d and WMCore reports %d; preferring the FJR one." % (fjrExitCode, exitCode))
                        exitCode = fjrExitCode
                except (TypeError, ValueError):
                    # int() rejects None (TypeError) and junk strings (ValueError);
                    # in both cases the caller's exit code stands.
                    pass
            # Missing/null fields contribute an empty line rather than a KeyError.
            exitMsg += (error.get('type') or '') + '\n'
            exitMsg += (error.get('details') or '') + '\n'

    report['exitAcronym'] = exitAcronym
    report['exitCode'] = exitCode
    report['exitMsg'] = exitMsg
    print("ERROR: Exceptional exit at %s (%s): %s" % (time.asctime(time.gmtime()), str(exitCode), str(exitMsg)))
    # Only print the saved traceback if an exception was active on entry.
    if not formatted_tb.startswith("None"):
        print("ERROR: Traceback follows:\n", formatted_tb)

    try:
        slc = SiteLocalConfig.loadSiteLocalConfig()
        report['executed_site'] = slc.siteName
        print("== Execution site for failed job from site-local-config.xml: %s" % slc.siteName)
    except Exception:
        # Failing to find the site must not prevent writing the report.
        print("ERROR: Failed to record execution site name in the FJR from the site-local-config.xml")
        print(traceback.format_exc())

    with open('jobReport.json', 'w') as of:
        json.dump(report, of)
    if ad and "CRAB3_RUNTIME_DEBUG" not in os.environ:
        stopDashboardMonitoring(ad)
                handleException("FAILED", EC_PsetHash, "Unable to compute pset hash for job output")
                mintime()
                sys.exit(EC_PsetHash)
        if jobExitCode: #TODO check exitcode from fwjr
            report['exitAcronym'] = "FAILED"
            report['exitCode'] = jobExitCode
            report['exitMsg'] = "Error while running CMSSW:\n"
            for error in report['steps']['cmsRun']['errors']:
                report['exitMsg'] += error['type'] + '\n'
                report['exitMsg'] += error['details'] + '\n'
        else:
            report['exitAcronym'] = "OK"
            report['exitCode'] = 0
            report['exitMsg'] = "OK"

        slc = SiteLocalConfig.loadSiteLocalConfig()
        report['executed_site'] = slc.siteName
        if 'phedex-node' in slc.localStageOut:
            report['phedex_node'] = slc.localStageOut['phedex-node']
        print("== Execution site from site-local-config.xml: %s" % slc.siteName)
        with open('jobReport.json', 'w') as of:
            json.dump(report, of)
        with open('jobReportExtract.pickle', 'w') as of:
            pickle.dump(report, of)
        if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ:
            stopDashboardMonitoring(ad)
        print("==== Report file creation FINISHING at %s ====" % time.asctime(time.gmtime()))
    except FwkJobReportException as FJRex:
        msg = "BadFWJRXML"
        handleException("FAILED", EC_ReportHandlingErr, msg)
        mintime()
Beispiel #4
0
            handleException("FAILED", EC_PsetHash, exmsg)
            mintime()
            sys.exit(EC_PsetHash)
        if jobExitCode:  #TODO check exitcode from fwjr
            rep['exitAcronym'] = "FAILED"
            rep['exitCode'] = jobExitCode
            rep['exitMsg'] = "Error while running CMSSW:\n"
            for error in rep['steps']['cmsRun']['errors']:
                rep['exitMsg'] += error['type'] + '\n'
                rep['exitMsg'] += error['details'] + '\n'
        else:
            rep['exitAcronym'] = "OK"
            rep['exitCode'] = 0
            rep['exitMsg'] = "OK"

        slCfg = SiteLocalConfig.loadSiteLocalConfig()
        rep['executed_site'] = slCfg.siteName
        if 'phedex-node' in slCfg.localStageOut:
            rep['phedex_node'] = slCfg.localStageOut['phedex-node']
        print("== Execution site from site-local-config.xml: %s" %
              slCfg.siteName)
        with open('jobReport.json', 'w') as of:
            json.dump(rep, of)
        with open('jobReportExtract.pickle', 'w') as of:
            pickle.dump(rep, of)
        if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ:
            stopDashboardMonitoring(ad)
        print("==== Report file creation FINISHED at %s ====" %
              time.asctime(time.gmtime()))
    except FwkJobReportException as FJRex:
        extype = "BadFWJRXML"
                                "Unable to compute pset hash for job output")
                mintime()
                sys.exit(EC_PsetHash)
        if jobExitCode:  #TODO check exitcode from fwjr
            report['exitAcronym'] = "FAILED"
            report['exitCode'] = jobExitCode
            report['exitMsg'] = "Error while running CMSSW:\n"
            for error in report['steps']['cmsRun']['errors']:
                report['exitMsg'] += error['type'] + '\n'
                report['exitMsg'] += error['details'] + '\n'
        else:
            report['exitAcronym'] = "OK"
            report['exitCode'] = 0
            report['exitMsg'] = "OK"

        slc = SiteLocalConfig.loadSiteLocalConfig()
        report['executed_site'] = slc.siteName
        if 'phedex-node' in slc.localStageOut:
            report['phedex_node'] = slc.localStageOut['phedex-node']
        print("== Execution site from site-local-config.xml: %s" %
              slc.siteName)
        with open('jobReport.json', 'w') as of:
            json.dump(report, of)
        with open('jobReportExtract.pickle', 'w') as of:
            pickle.dump(report, of)
        if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ:
            stopDashboardMonitoring(ad)
        print("==== Report file creation FINISHING at %s ====" %
              time.asctime(time.gmtime()))
    except FwkJobReportException as FJRex:
        msg = "BadFWJRXML"
Beispiel #6
0
def handleException(exitAcronym, exitCode, exitMsg):
    """Record a failed job's exit information in ``jobReport.json``.

    Starts from the existing ``jobReport.json`` in the current directory
    (or an empty report when it is absent, unparsable or not a JSON
    object), appends the type and details of each cmsRun step error to
    ``exitMsg``, truncates the message to 10,000 characters, adds the
    execution site when the site-local config can be loaded, and writes the
    report back to ``jobReport.json``.

    :param exitAcronym: value stored under the report's ``'exitAcronym'`` key.
    :param exitCode: exit code supplied by the caller. It is replaced by an
        error's own exit code when that code differs from it but equals it
        modulo 256.
    :param exitMsg: message stored under ``'exitMsg'`` after the error texts
        are appended and the length limit is applied.
    """
    # First save the traceback before it gets overwritten by other
    # tracebacks (e.g. the one raised by a malformed jobReport).
    formatted_tb = traceback.format_exc()

    report = {}
    try:
        if os.path.exists("jobReport.json"):
            # Context manager: the handle is closed even if parsing fails.
            with open("jobReport.json") as reportFile:
                report = json.load(reportFile)
        else:
            print(
                "WARNING: WMCore did not produce a jobReport.json; FJR will not be useful."
            )
    except Exception:
        print(
            "WARNING: Unable to parse WMCore's jobReport.json; FJR will not be useful.\n",
            traceback.format_exc())
    if not isinstance(report, dict):
        # Valid JSON that is not an object cannot carry the exit fields;
        # fall back to an empty report instead of crashing on .get().
        print(
            "WARNING: WMCore's jobReport.json is not a JSON object; FJR will not be useful."
        )
        report = {}

    # Tolerate 'steps' / 'cmsRun' being present but null.
    cmsRunStep = (report.get('steps') or {}).get('cmsRun') or {}
    cmsRunErrors = cmsRunStep.get('errors')
    if cmsRunErrors:
        exitMsg += '\nCMSSW error message follows.\n'
        for err in cmsRunErrors:
            if 'exitCode' in err:
                try:
                    exitCode = int(exitCode)
                    fjrExitCode = int(err['exitCode'])
                    # NOTE(review): presumably the caller's code is the FJR
                    # code truncated to 8 bits by the process exit status.
                    if (fjrExitCode % 256
                            == exitCode) and (fjrExitCode != exitCode):
                        print(
                            "NOTE: FJR has exit code %d and WMCore reports %d; preferring the FJR one."
                            % (fjrExitCode, exitCode))
                        exitCode = fjrExitCode
                except (TypeError, ValueError):
                    # Null or non-numeric code: keep what the caller gave us.
                    pass
            # A missing or null field must not abort the error handler.
            exitMsg += (err.get('type') or '') + '\n'
            exitMsg += (err.get('details') or '') + '\n'

    report['exitAcronym'] = exitAcronym
    report['exitCode'] = exitCode

    # check size of message string passed by caller
    maxChars = 10 * 1000
    if len(exitMsg) > maxChars:
        exitMsg = exitMsg[0:maxChars] + " + ... message truncated at 10k chars"
    report['exitMsg'] = exitMsg
    print("ERROR: Exceptional exit at %s (%s): %s" %
          (time.asctime(time.gmtime()), str(exitCode), str(exitMsg)))
    # format_exc() yields a string starting with "None" when no exception
    # was being handled on entry.
    if not formatted_tb.startswith("None"):
        print("ERROR: Traceback follows:\n", formatted_tb)

    try:
        sLCfg = SiteLocalConfig.loadSiteLocalConfig()
        report['executed_site'] = sLCfg.siteName
        print(
            "== Execution site for failed job from site-local-config.xml: %s" %
            sLCfg.siteName)
    except Exception:
        # Best effort: the report is still written without the site name.
        print(
            "ERROR: Failed to record execution site name in the FJR from the site-local-config.xml"
        )
        print(traceback.format_exc())

    with open('jobReport.json', 'w') as rf:
        json.dump(report, rf)
Beispiel #7
0
            handleException("FAILED", EC_PsetHash, exmsg)
            mintime()
            sys.exit(EC_PsetHash)
        if jobExitCode: #TODO check exitcode from fwjr
            rep['exitAcronym'] = "FAILED"
            rep['exitCode'] = jobExitCode
            rep['exitMsg'] = "Error while running CMSSW:\n"
            for error in rep['steps']['cmsRun']['errors']:
                rep['exitMsg'] += error['type'] + '\n'
                rep['exitMsg'] += error['details'] + '\n'
        else:
            rep['exitAcronym'] = "OK"
            rep['exitCode'] = 0
            rep['exitMsg'] = "OK"

        slCfg = SiteLocalConfig.loadSiteLocalConfig()
        rep['executed_site'] = slCfg.siteName
        if 'phedex-node' in slCfg.localStageOut:
            rep['phedex_node'] = slCfg.localStageOut['phedex-node']
        print("== Execution site from site-local-config.xml: %s" % slCfg.siteName)
        with open('jobReport.json', 'w') as of:
            json.dump(rep, of)
        with open('jobReportExtract.pickle', 'w') as of:
            pickle.dump(rep, of)
        if ad and not "CRAB3_RUNTIME_DEBUG" in os.environ:
            stopDashboardMonitoring(ad)
        print("==== Report file creation FINISHED at %s ====" % time.asctime(time.gmtime()))
    except FwkJobReportException as FJRex:
        extype = "BadFWJRXML"
        handleException("FAILED", EC_ReportHandlingErr, extype)
        mintime()