Example #1
    def test_tail(self):
        """
        _tail_

        Can we tail a file?
        """



        a = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"

        f = open('tmpfile.tmp', 'w')
        f.write(a)
        f.close()



        self.assertEqual(BasicAlgos.tail('tmpfile.tmp', 10),
                         ['g\n', 'h\n', 'i\n', 'j\n', 'k\n',
                          'l\n', 'm\n', 'n\n', 'o\n', 'p\n'])

        self.assertEqual(BasicAlgos.tail('tmpfile.tmp', 2),
                         ['o\n', 'p\n'])


        os.remove('tmpfile.tmp')


        return
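The test above pins down the contract these snippets assume for BasicAlgos.tail: given a path and a line count, it returns the last N lines of the file as a list of strings with trailing newlines preserved. A minimal sketch satisfying that contract (assuming the file fits in memory; the real WMCore helper may instead read blocks from the end of the file):

    def tail(filename, nLines=20):
        """Return the last nLines lines of filename, newlines included."""
        with open(filename) as f:
            return f.readlines()[-nLines:]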
Example #2
    def testStripReport(self):
        """
        _testStripReport_

        Test whether or not we can strip input file information
        from a FWJR and create a smaller object.
        """

        myReport = Report("cmsRun1")
        myReport.parse(self.xmlPath)

        path1 = os.path.join(self.testDir, 'testReport1.pkl')
        path2 = os.path.join(self.testDir, 'testReport2.pkl')

        myReport.save(path1)
        info = BasicAlgos.getFileInfo(filename = path1)
        self.assertEqual(info['Size'], 7101)

        inputFiles = myReport.getAllInputFiles()
        self.assertEqual(len(inputFiles), 1)
        myReport.stripInputFiles()
        self.assertEqual(len(myReport.getAllInputFiles()), 0)

        myReport.save(path2)
        info = BasicAlgos.getFileInfo(filename = path2)
        self.assertEqual(info['Size'], 6210)

        return
Example #4
    def testStripReport(self):
        """
        _testStripReport_

        Test whether or not we can strip input file information
        from a FWJR and create a smaller object.
        """

        myReport = Report("cmsRun1")
        myReport.parse(self.xmlPath)

        path1 = os.path.join(self.testDir, 'testReport1.pkl')
        path2 = os.path.join(self.testDir, 'testReport2.pkl')

        myReport.save(path1)
        info = BasicAlgos.getFileInfo(filename=path1)
        self.assertEqual(info['Size'], 6821)

        inputFiles = myReport.getAllInputFiles()
        self.assertEqual(len(inputFiles), 1)
        myReport.stripInputFiles()
        self.assertEqual(len(myReport.getAllInputFiles()), 0)

        myReport.save(path2)
        info = BasicAlgos.getFileInfo(filename=path2)
        self.assertEqual(info['Size'], 5933)

        return
Example #5
    def __call__(self, errCode, executor, **args):
        logging.critical("%s Diagnostic Handler invoked",
                         self.__class__.__name__)
        msg = "Error in CMSSW: %s\n" % (errCode)
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        excepInst = args.get('ExceptionInstance', None)

        description = "Misc. CMSSW error"
        if excepInst:
            if hasattr(excepInst, 'detail'):
                description = excepInst.detail
            msg += str(excepInst)

        if os.path.exists(jobRepXml):
            # job report XML exists, load the exception information from it
            try:
                executor.report.parse(jobRepXml)
            except FwkJobReportException:
                # Job report is bad, the parse already puts a 50115 in the file
                pass
            reportStep = executor.report.retrieveStep(
                executor.step._internal_name)
            reportStep.status = errCode

        # Grab stderr log from CMSSW
        errLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stderr.log' % (executor.step._internal_name))
        outLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stdout.log' % (executor.step._internal_name))

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, DEFAULT_TAIL_LINES_FROM_LOG)
            msg += '\n Adding last %s lines of CMSSW stderr:\n' % DEFAULT_TAIL_LINES_FROM_LOG
            msg += logTail
        if os.path.exists(outLog):
            logTail = BasicAlgos.tail(outLog, DEFAULT_TAIL_LINES_FROM_LOG)
            msg += '\n Adding last %s lines of CMSSW stdout:\n' % DEFAULT_TAIL_LINES_FROM_LOG
            msg += logTail

        # If it exists, grab the SCRAM log
        errLog = os.path.join(os.path.dirname(jobRepXml), 'scramOutput.log')

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 25)
            msg += '\n Adding last 25 lines of SCRAM error log:\n'
            msg += logTail

        # make sure the report has the error in it
        dummy = getattr(executor.report.report, "errors",
                        None)  # Seems to do nothing
        executor.report.addError(executor.step._internal_name, errCode,
                                 description, msg)

        return
Example #6
    def __call__(self, errCode, executor, **args):
        logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
        msg = "Error in CMSSW: %s\n" % (errCode)
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        excepInst = args.get('ExceptionInstance', None)

        description = "Misc. CMSSW error"
        if excepInst:
            if hasattr(excepInst, 'detail'):
                description = excepInst.detail
            msg += str(excepInst)

        if os.path.exists(jobRepXml):
            # job report XML exists, load the exception information from it
            try:
                executor.report.parse(jobRepXml)
            except FwkJobReportException:
                # Job report is bad, the parse already puts a 50115 in the file
                pass
            reportStep = executor.report.retrieveStep(executor.step._internal_name)
            reportStep.status = errCode

        # Grab stderr log from CMSSW
        errLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stderr.log' % (executor.step._internal_name))
        outLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stdout.log' % (executor.step._internal_name))


        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, DEFAULT_TAIL_LINES_FROM_LOG)
            msg += '\n Adding last %s lines of CMSSW stderr:\n' % DEFAULT_TAIL_LINES_FROM_LOG
            msg += logTail
        if os.path.exists(outLog):
            logTail = BasicAlgos.tail(outLog, DEFAULT_TAIL_LINES_FROM_LOG)
            msg += '\n Adding last %s lines of CMSSW stdout:\n' % DEFAULT_TAIL_LINES_FROM_LOG
            msg += logTail

        # If it exists, grab the SCRAM log
        errLog = os.path.join(os.path.dirname(jobRepXml),
                              'scramOutput.log')

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 25)
            msg += '\n Adding last 25 lines of SCRAM error log:\n'
            msg += logTail

        # make sure the report has the error in it
        dummy = getattr(executor.report.report, "errors", None)  # Seems to do nothing
        executor.report.addError(executor.step._internal_name,
                                 errCode, description, msg)

        return
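Note that Examples #5 and #6 concatenate the tail output directly onto msg (msg += logTail), so the BasicAlgos.tail used here evidently returns a single string rather than the list of lines exercised in Example #1. A minimal string-returning sketch under that assumption:

    def tail(filename, nLines=20):
        """Sketch: the last nLines of filename, joined into one string."""
        with open(filename) as f:
            return ''.join(f.readlines()[-nLines:])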
Example #7
    def __call__(self, errCode, executor, **args):
        logging.critical("%s Diagnostic Handler invoked" %
                         self.__class__.__name__)
        msg = "Error in CMSSW: %s\n" % (errCode)
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        excepInst = args.get('ExceptionInstance', None)

        description = "Misc. CMSSW error"
        if excepInst:
            if hasattr(excepInst, 'detail'):
                description = excepInst.detail
            msg += str(excepInst)

        if os.path.exists(jobRepXml):
            # job report XML exists, load the exception information from it
            executor.report.parse(jobRepXml)
            reportStep = executor.report.retrieveStep(
                executor.step._internal_name)
            reportStep.status = errCode

        # Grab stderr log from CMSSW
        errLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stderr.log' % (executor.step._internal_name))
        outLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stdout.log' % (executor.step._internal_name))

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 10)
            msg += '\n Adding last ten lines of CMSSW stderr:\n'
            msg += "".join(logTail)
        if os.path.exists(outLog):
            logTail = BasicAlgos.tail(outLog, 10)
            msg += '\n Adding last ten lines of CMSSW stdout:\n'
            msg += "".join(logTail)

        # If it exists, grab the SCRAM log
        errLog = os.path.join(os.path.dirname(jobRepXml), 'scramOutput.log')

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 25)
            msg += '\n Adding last 25 lines of SCRAM error log:\n'
            msg += "".join(logTail)

        # make sure the report has the error in it
        errSection = getattr(executor.report.report, "errors", None)
        executor.report.addError(executor.step._internal_name, errCode,
                                 description, msg)

        return
Example #8
    def __call__(self, errCode, executor, **args):
        print "%s Diagnostic Handler invoked" % self.__class__.__name__
        msg = "Error in CMSSW: %s\n" % (errCode)
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        excepInst = args.get('ExceptionInstance', None)

        description = "Misc. CMSSW error"
        if excepInst:
            if hasattr(excepInst, 'detail'):
                description = excepInst.detail
            msg += str(excepInst)
        
        if os.path.exists(jobRepXml):
            # job report XML exists, load the exception information from it
            executor.report.parse(jobRepXml)
            reportStep = executor.report.retrieveStep(executor.step._internal_name)
            reportStep.status = errCode

        # Grab stderr log from CMSSW
        errLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stderr.log' % (executor.step._internal_name))
        outLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stdout.log' % (executor.step._internal_name))

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 10)
            msg += '\n Adding last ten lines of CMSSW stderr:\n'
            msg += "".join(logTail)
        if os.path.exists(outLog):
            logTail = BasicAlgos.tail(outLog, 10)
            msg += '\n Adding last ten lines of CMSSW stdout:\n'
            msg += "".join(logTail)

        # If it exists, grab the SCRAM log
        errLog = os.path.join(os.path.dirname(jobRepXml),
                              'scramOutput.log')

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 25)
            msg += '\n Adding last 25 lines of SCRAM error log:\n'
            msg += "".join(logTail)
                
        # make sure the report has the error in it
        errSection = getattr(executor.report.report, "errors", None)
        executor.report.addError(executor.step._internal_name,
                                 errCode, description, msg)


        return
Example #9
    def __call__(self, errCode, executor, **args):
        """
        Added for Steve to handle SCRAM script failure

        Must fail job (since SCRAM didn't run)

        """
        msg = "SCRAM scripts failed to run!\n"
        if args.get('ExceptionInstance', False):
            msg += str(args.get('ExceptionInstance'))

        jobReport = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)
        errLog = os.path.join(os.path.dirname(jobReport),
                              'scramOutput.log')

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 25)
            msg += '\n Adding last 25 lines of SCRAM error log:\n'
            msg += "".join(logTail)

        executor.report.addError(executor.step._internal_name,
                                 50513, "SCRAMScriptFailure", msg)

        # Then mark the job as failed
        if executor.report.report.status == 0:
            executor.report.report.status = 1
Example #10
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:

            if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue
            elif os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error("The job report for job with id %s and gridid %s is a directory", job['id'],
                              job['gridid'])
                logging.error("Ignoring this, but this is very strange")
            else:
                logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

                if os.path.isfile(reportName):
                    os.remove(reportName)

                # create a report from scratch
                condorReport = Report()
                logOutput = 'Could not find jobReport\n'

                if os.path.isdir(job['cache_dir']):
                    condorOut = "condor.%s.out" % job['gridid']
                    condorErr = "condor.%s.err" % job['gridid']
                    condorLog = "condor.%s.log" % job['gridid']
                    for condorFile in [condorOut, condorErr, condorLog]:
                        condorFilePath = os.path.join(job['cache_dir'], condorFile)
                        if os.path.isfile(condorFilePath):
                            logTail = BasicAlgos.tail(condorFilePath, 50)
                            logOutput += 'Adding end of %s to error message:\n' % condorFile
                            logOutput += '\n'.join(logTail)
                    condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
                else:
                    msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                    msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                    msg += "Creating a new cache_dir for failed job report\n"
                    logging.error(msg)
                    os.makedirs(job['cache_dir'])
                    condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)

                condorReport.save(filename=reportName)

                logging.debug("Created failed job report for job with id %s and gridid %s", job['id'], job['gridid'])

        return
Example #11
    def __call__(self, errCode, executor, **args):
        print "%s Diagnostic Handler invoked" % self.__class__.__name__
        msg = "Exit %s: %s Exception from cmsRun" % (self.code, self.desc)
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        if os.path.exists(jobRepXml):
            # job report XML exists, load the exception information from it
            try:
                executor.report.parse(jobRepXml)
            except FwkJobReportException:
                # Job report is bad, the parse already puts a 50115 in the file
                pass
            reportStep = executor.report.retrieveStep(executor.step._internal_name)
            reportStep.status = self.code


        errLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stderr.log' % (executor.step._internal_name))
        outLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stdout.log' % (executor.step._internal_name))

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 10)
            msg += '\n Adding last ten lines of CMSSW stderr:\n'
            msg += "".join(logTail)
        if os.path.exists(outLog):
            logTail = BasicAlgos.tail(outLog, 10)
            msg += '\n Adding last ten lines of CMSSW stdout:\n'
            msg += "".join(logTail)

        # make sure the report has the error in it
        errSection = getattr(executor.report.report, "errors", None)
        if errSection == None:
            executor.report.addError(executor.step._internal_name,
                                     self.code, self.desc, msg)
        else:
            if not hasattr(errSection, self.desc):
                executor.report.addError(executor.step._internal_name,
                                         self.code, self.desc, msg)

        print executor.report.report.errors
        return
Example #12
    def __call__(self, errCode, executor, **args):
        print "%s Diagnostic Handler invoked" % self.__class__.__name__
        msg = "Exit %s: %s Exception from cmsRun" % (self.code, self.desc)
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        if os.path.exists(jobRepXml):
            # job report XML exists, load the exception information from it
            try:
                executor.report.parse(jobRepXml)
            except FwkJobReportException:
                # Job report is bad, the parse already puts a 50115 in the file
                pass
            reportStep = executor.report.retrieveStep(
                executor.step._internal_name)
            reportStep.status = self.code

        errLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stderr.log' % (executor.step._internal_name))
        outLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stdout.log' % (executor.step._internal_name))

        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 10)
            msg += '\n Adding last ten lines of CMSSW stderr:\n'
            msg += "".join(logTail)
        if os.path.exists(outLog):
            logTail = BasicAlgos.tail(outLog, 10)
            msg += '\n Adding last ten lines of CMSSW stdout:\n'
            msg += "".join(logTail)

        # make sure the report has the error in it
        errSection = getattr(executor.report.report, "errors", None)
        if errSection == None:
            executor.report.addError(executor.step._internal_name, self.code,
                                     self.desc, msg)
        else:
            if not hasattr(errSection, self.desc):
                executor.report.addError(executor.step._internal_name,
                                         self.code, self.desc, msg)

        print executor.report.report.errors
        return
Example #13
def parseCondorLogs(logfile, extension):
    """
    Retrieve the last X lines of the log file
    """
    errLog = None
    logOut = ''

    logPaths = glob.glob(logfile)
    if len(logPaths):
        errLog = max(logPaths, key=lambda path: os.stat(path).st_mtime)
    if errLog is not None and os.path.isfile(errLog):
        logTail = BasicAlgos.tail(errLog, 50)
        logOut += 'Adding end of condor.%s to error message:\n' % extension
        logOut += logTail
        logOut += '\n\n'
    return logOut
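A hypothetical call, assuming the condor.<gridid>.<retry> log naming used elsewhere in this listing (cacheDir is illustrative):

    genLogPath = os.path.join(cacheDir, 'condor.*.*.err')  # glob pattern, not a literal path
    logOut = parseCondorLogs(genLogPath, 'err')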
Example #15
    def test_fileInfo(self):
        """
        _fileInfo_

        Test for basic file info
        """
        silly = "This is a rather ridiculous string"
        filename = os.path.join(self.testDir, 'fileInfo.test')

        f = open(filename, 'w')
        f.write(silly)
        f.close()

        info = BasicAlgos.getFileInfo(filename = filename)
        self.assertEqual(info['Name'], filename)
        self.assertEqual(info['Size'], 34)
        return
Example #16
    def test_fileInfo(self):
        """
        _fileInfo_

        Test for basic file info
        """
        silly = "This is a rather ridiculous string"
        filename = os.path.join(self.testDir, 'fileInfo.test')

        f = open(filename, 'w')
        f.write(silly)
        f.close()
        
        info = BasicAlgos.getFileInfo(filename = filename)
        self.assertEqual(info['Name'], filename)
        self.assertEqual(info['Size'], 34)
        return
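Both file-info tests require only that BasicAlgos.getFileInfo return a dictionary with at least the file's 'Name' and its 'Size' in bytes (34 here, the length of the test string). A minimal sketch built on os.stat; the real helper may report further fields, and 'LastModification' below is an assumed extra:

    import os

    def getFileInfo(filename=None):
        """Sketch: name, size in bytes and mtime of a file."""
        stats = os.stat(filename)
        return {'Name': filename,
                'Size': stats.st_size,
                'LastModification': stats.st_mtime}  # assumed extra field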
Example #17
    def test_MD5(self):
        """
        _MD5_

        Check if we can create an MD5 checksum
        """

        silly = "This is a rather ridiculous string"
        filename = '/tmp/md5test.test'

        f = open(filename, 'w')
        f.write(silly)
        f.close()

        self.assertEqual(BasicAlgos.getMD5(filename = filename),
                         hashlib.md5(silly).hexdigest())

        os.remove(filename)
        return
Example #18
    def test_MD5(self):
        """
        _MD5_

        Check if we can create an MD5 checksum
        """

        silly = "This is a rather ridiculous string"
        filename = os.path.join(self.testDir, 'md5test.test')

        f = open(filename, 'w')
        f.write(silly)
        f.close()

        self.assertEqual(BasicAlgos.getMD5(filename = filename),
                         hashlib.md5(silly).hexdigest())

        os.remove(filename)
        return
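The MD5 tests only compare BasicAlgos.getMD5 against hashlib.md5 over the file contents, so any implementation that digests the full file passes. A minimal sketch that reads in blocks to keep memory bounded (blockSize is an illustrative default):

    import hashlib

    def getMD5(filename=None, blockSize=2 ** 20):
        """Sketch: hex MD5 digest of a file, read block by block."""
        md5 = hashlib.md5()
        with open(filename, 'rb') as f:
            for block in iter(lambda: f.read(blockSize), b''):
                md5.update(block)
        return md5.hexdigest()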
Example #19
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:
            if job.get("cache_dir", None) == None or job.get("retry_count", None) == None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
                logging.error("cache_dir: %s" % job.get("cache_dir", "Missing"))
                logging.error("retry_count: %s" % job.get("retry_count", "Missing"))
                continue
            reportName = os.path.join(job["cache_dir"], "Report.%i.pkl" % job["retry_count"])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # Then we have a real report.
                # Do nothing
                continue
            if os.path.isdir(reportName):
                # Then something weird has happened.
                # File error, do nothing
                logging.error("Went to check on error report for job %i.  Found a directory instead.\n" % job["id"])
                logging.error("Ignoring this, but this is very strange.\n")

            # If we're still here, we must not have a real error report
            logOutput = "Could not find jobReport"
            logPath = os.path.join(job["cache_dir"], "condor.log")
            if os.path.isfile(logPath):
                logTail = BasicAlgos.tail(logPath, 50)
                logOutput += "Adding end of condor.log to error message:\n"
                logOutput += '\n'.join(logTail)
            condorReport = Report()
            condorReport.addError("NoJobReport", 61303, "NoJobReport", logOutput)
            condorReport.save(filename=reportName)
            logging.debug("No returning job report for job %i" % job["id"])

        return
Example #20
    def execute(self, emulator = None, **overrides):
        """
        _execute_


        """
        # Are we using emulators again?
        if (emulator != None):
            return emulator.emulate( self.step, self.job )

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Find alternate stageout location
        self.altLFN = overrides.get('altLFN', None)

        logging.info("Beginning Steps.Executors.LogArchive.Execute")
        logging.info("Using the following overrides: %s " % overrides)
        logging.info("Step is: %s" % self.step)
        # Wait timout for stageOut
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        matchFiles = [
            ".log$",
            "FrameworkJobReport",
            "Report.pkl",
            "Report.pcl",
            "^PSet.py$",
            "^PSet.pkl$"
            ]

        #Okay, we need a stageOut Manager
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            (overrides.has_key('newStageOut') and overrides.get('newStageOut')):
            useNewStageOutCode = True
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr.StageOutMgr(**overrides)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime  = self.step.retryDelay
        else:
            # new style
            logging.info("LOGARCHIVE IS USING NEW STAGEOUT CODE")
            manager = WMCore.Storage.FileManager.StageOutMgr(
                                retryPauseTime  = self.step.retryDelay,
                                numberOfRetries = self.step.retryCount,
                                **overrides)

        #Now we need to find all the reports
        logFilesForTransfer = []
        #Look in the taskSpace first
        logFilesForTransfer.extend(self.findFilesInDirectory(self.stepSpace.taskSpace.location, matchFiles))

        #What if it's empty?
        if len(logFilesForTransfer) == 0:
            msg = "Could find no log files in job"
            logging.error(msg)
            return logFilesForTransfer

        #Now that we've gone through all the steps, we have to tar it out
        tarName         = 'logArchive.tar.gz'
        tarBallLocation = os.path.join(self.stepSpace.location, tarName)
        tarBall         = tarfile.open(tarBallLocation, 'w:gz')
        for f in logFilesForTransfer:
            tarBall.add(name  = f,
                        arcname = f.replace(self.stepSpace.taskSpace.location, '', 1).lstrip('/'))
        tarBall.close()


        fileInfo = {'LFN': self.getLFN(tarName),
            'PFN' : tarBallLocation,
            'SEName' : None,
            'GUID' : None
            }

        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            manager(fileInfo)
            self.report.addOutputModule(moduleName = "logArchive")
            reportFile = {"lfn": fileInfo["LFN"], "pfn": fileInfo["PFN"],
                          "location": fileInfo["SEName"], "module_label": "logArchive",
                          "events": 0, "size": 0, "merged": False,
                          "checksums": {'md5': BasicAlgos.getMD5(tarBallLocation),
                                        'adler32': readAdler32(tarBallLocation),
                                        'cksum': readCksum(tarBallLocation)}}
            self.report.addOutputFile(outputModule = "logArchive", file = reportFile)
        except Alarm:
            msg = "Indefinite hang during stageOut of logArchive"
            logging.error(msg)
            self.report.addError(self.stepName, 60404, "LogArchiveTimeout", msg)
            self.report.persist("Report.pkl")
            raise WMExecutionFailure(60404, "LogArchiveTimeout", msg)
        except WMException, ex:
            self.report.addError(self.stepName, 60307, "LogArchiveFailure", str(ex))
            self.report.setStepStatus(self.stepName, 0)
            self.report.persist("Report.pkl")
            raise ex
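Example #20 bounds the stage-out with a SIGALRM watchdog: signal.alarm(waitTime) arms a timer and the except Alarm branch catches the timeout. A minimal sketch of the two module-level helpers it assumes:

    import signal

    class Alarm(Exception):
        """Raised by alarmHandler when the watchdog timer fires."""
        pass

    def alarmHandler(signum, frame):
        raise Alarm()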
Example #21
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:

            if job.get('cache_dir', None) is None or job.get(
                    'retry_count', None) is None:
                # Then we can't do anything
                logging.error(
                    "Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'],
                                      'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue
            elif os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error(
                    "The job report for job with id %s and gridid %s is a directory",
                    job['id'], job['gridid'])
                logging.error("Ignoring this, but this is very strange")
            else:
                logging.error("No job report for job with id %s and gridid %s",
                              job['id'], job['gridid'])

                if os.path.isfile(reportName):
                    os.remove(reportName)

                # create a report from scratch
                condorReport = Report()
                logOutput = 'Could not find jobReport\n'

                if os.path.isdir(job['cache_dir']):
                    condorErr = "condor.%s.err" % job['gridid']
                    condorOut = "condor.%s.out" % job['gridid']
                    condorLog = "condor.%s.log" % job['gridid']
                    exitCode = 99303
                    exitType = "NoJobReport"
                    for condorFile in [condorErr, condorOut, condorLog]:
                        condorFilePath = os.path.join(job['cache_dir'],
                                                      condorFile)
                        logOutput += "\n========== %s ==========\n" % condorFile
                        if os.path.isfile(condorFilePath):
                            logTail = BasicAlgos.tail(condorFilePath, 50)
                            logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                            logOutput += logTail
                            logOutput += '\n\n'

                            if condorFile == condorLog:
                                # for condor log, search for the information
                                for matchObj in getIterMatchObjectOnRegexp(
                                        condorFilePath,
                                        CONDOR_LOG_FILTER_REGEXP):
                                    condorReason = matchObj.group("Reason")
                                    if condorReason:
                                        logOutput += condorReason
                                        if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                            exitCode = 99400
                                            exitType = "RemovedByGLIDEIN"
                                        else:
                                            exitCode = 99401

                                    siteName = matchObj.group("Site")
                                    if siteName:
                                        condorReport.data.siteName = siteName
                                    else:
                                        condorReport.data.siteName = "NoReportedSite"
                            else:
                                for matchObj in getIterMatchObjectOnRegexp(
                                        condorFilePath, WMEXCEPTION_REGEXP):
                                    errMsg = matchObj.group('WMException')
                                    if errMsg:
                                        logOutput += "\n\n%s\n" % errMsg

                                    errMsg = matchObj.group('ERROR')
                                    if errMsg:
                                        logOutput += "\n\n%s\n" % errMsg

                    logOutput += '\n\n'
                    condorReport.addError(exitType, exitCode, exitType,
                                          logOutput)
                else:
                    msg = "Serious Error in Completing condor job with id %s!\n" % job[
                        'id']
                    msg += "Could not find jobCache directory %s\n" % job[
                        'cache_dir']
                    msg += "Creating a new cache_dir for failed job report\n"
                    logging.error(msg)
                    os.makedirs(job['cache_dir'])
                    condorReport.addError("NoJobReport", 99304, "NoCacheDir",
                                          logOutput)

                condorReport.save(filename=reportName)

                logging.debug(
                    "Created failed job report for job with id %s and gridid %s",
                    job['id'], job['gridid'])

        return
Example #22
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:
            if job.get('cache_dir', None) == None or job.get('retry_count', None) == None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
                logging.error("cache_dir: %s" % job.get('cache_dir', 'Missing'))
                logging.error("retry_count: %s" % job.get('retry_count', 'Missing'))
                continue
            reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # Then we have a real report.
                # Do nothing
                continue
            if os.path.isdir(reportName):
                # Then something weird has happened.
                # File error, do nothing
                logging.error("Went to check on error report for job %i.  Found a directory instead.\n" % job['id'])
                logging.error("Ignoring this, but this is very strange.\n")

            # If we're still here, we must not have a real error report
            logOutput = 'Could not find jobReport\n'
            #But we don't know exactly the condor id, so it will append
            #the last lines of the latest condor log in cache_dir
            genLogPath = os.path.join(job['cache_dir'], 'condor.*.*.log')
            logPaths = glob.glob(genLogPath)
            errLog = None
            if len(logPaths):
                errLog = max(logPaths, key = lambda path :
                                                    os.stat(path).st_mtime)
            if errLog != None and os.path.isfile(errLog):
                logTail = BasicAlgos.tail(errLog, 50)
                logOutput += 'Adding end of condor.log to error message:\n'
                logOutput += '\n'.join(logTail)
            if not os.path.isdir(job['cache_dir']):
                msg =  "Serious Error in Completing condor job with id %s!\n" % job.get('id', 'unknown')
                msg += "Could not find jobCache directory - directory deleted under job: %s\n" % job['cache_dir']
                msg += "Creating artificial cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                logOutput += msg
                condorReport = Report()
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
                condorReport.save(filename = reportName)
                continue
            condorReport = Report()
            condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
            if os.path.isfile(reportName):
                # Then we have a file already there.  It should be zero size due
                # to the if statements above, but we should remove it.
                if os.path.getsize(reportName) > 0:
                    # This should never happen.  If it does, ignore it
                    msg =  "Critical strange problem.  FWJR changed size while being processed."
                    logging.error(msg)
                else:
                    try:
                        os.remove(reportName)
                        condorReport.save(filename = reportName)
                    except Exception as ex:
                        logging.error("Cannot remove and replace empty report %s" % reportName)
                        logging.error("Report continuing without error!")
            else:
                condorReport.save(filename = reportName)

            # Debug message to end loop
            logging.debug("No returning job report for job %i" % job['id'])


        return
Example #23
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:

            if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue
            elif os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error("The job report for job with id %s and gridid %s is a directory", job['id'],
                              job['gridid'])
                logging.error("Ignoring this, but this is very strange")
            else:
                logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

                if os.path.isfile(reportName):
                    os.remove(reportName)

                # create a report from scratch
                condorReport = Report()
                logOutput = 'Could not find jobReport\n'

                if os.path.isdir(job['cache_dir']):
                    condorErr = "condor.%s.err" % job['gridid']
                    condorOut = "condor.%s.out" % job['gridid']
                    condorLog = "condor.%s.log" % job['gridid']
                    exitCode = 99303
                    exitType = "NoJobReport"
                    for condorFile in [condorErr, condorOut, condorLog]:
                        condorFilePath = os.path.join(job['cache_dir'], condorFile)
                        logOutput += "\n========== %s ==========\n" % condorFile
                        if os.path.isfile(condorFilePath):
                            logTail = BasicAlgos.tail(condorFilePath, 50)
                            logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                            logOutput += logTail
                            logOutput += '\n\n'

                            if condorFile == condorLog:
                                # for condor log, search for the information
                                for matchObj in getIterMatchObjectOnRegexp(condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                                    condorReason = matchObj.group("Reason")
                                    if condorReason:
                                        logOutput += condorReason
                                        if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                            exitCode = 99400
                                            exitType = "RemovedByGLIDEIN"
                                        else:
                                            exitCode = 99401

                                    siteName = matchObj.group("Site")
                                    if siteName:
                                        condorReport.data.siteName = siteName
                                    else:
                                        condorReport.data.siteName = "NoReportedSite"
                            else:
                                for matchObj in getIterMatchObjectOnRegexp(condorFilePath, WMEXCEPTION_REGEXP):
                                    errMsg = matchObj.group('WMException')
                                    if errMsg:
                                        logOutput += "\n\n%s\n" % errMsg

                                    errMsg = matchObj.group('ERROR')
                                    if errMsg:
                                        logOutput += "\n\n%s\n" % errMsg

                    logOutput += '\n\n'
                    condorReport.addError(exitType, exitCode, exitType, logOutput)
                else:
                    msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                    msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                    msg += "Creating a new cache_dir for failed job report\n"
                    logging.error(msg)
                    os.makedirs(job['cache_dir'])
                    condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)

                condorReport.save(filename=reportName)

                logging.debug("Created failed job report for job with id %s and gridid %s", job['id'], job['gridid'])

        return
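Examples #21 and #23 scan the condor logs with getIterMatchObjectOnRegexp, a helper that yields regexp match objects carrying named groups (Reason, Site, WMException, ERROR). A minimal line-oriented sketch under that assumption; the real helper may well search the file in larger chunks:

    import re

    def getIterMatchObjectOnRegexp(filePath, regexp):
        """Sketch: yield each match of regexp found in filePath."""
        with open(filePath) as f:
            for line in f:
                matchObj = re.search(regexp, line)
                if matchObj:
                    yield matchObj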
Example #24
    def __call__(self, errCode, executor, **args):
        """
        _operator()_

        Look for the XML job report, try and read it and extract the error information from it

        """
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        errLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stderr.log' % (executor.step._internal_name))
        outLog = os.path.join(os.path.dirname(jobRepXml),
                              '%s-stdout.log' % (executor.step._internal_name))

        addOn = '\n'
        if os.path.exists(errLog):
            logTail = BasicAlgos.tail(errLog, 10)
            addOn += '\nAdding last ten lines of CMSSW stderr:\n'
            addOn += "".join(logTail)
        else:
            logging.error("No stderr from CMSSW")
            logging.error(os.listdir(os.path.dirname(jobRepXml)))

        msg = ''
        if os.path.exists(outLog):
            logTail = BasicAlgos.tail(outLog, 10)
            msg += '\n Adding last ten lines of CMSSW stdout:\n'
            msg += "".join(logTail)

        # Add the error we were sent
        ex = args.get('ExceptionInstance', None)
        executor.report.addError(executor.step._internal_name,
                                 errCode, "CMSSWStepFailure", msg + str(ex))

        if not os.path.exists(jobRepXml):
            # no report => Error
            msg = "No Job Report Found: %s" % jobRepXml
            executor.report.addError(executor.step._internal_name,
                                     50115, "MissingJobReport", msg)
            return

        # job report XML exists, load the exception information from it
        try:
            executor.report.parse(jobRepXml)
        except FwkJobReportException:
            # Job report is bad, the parse already puts a 50115 in the file
            # just go on
            pass

        # make sure the report has the error in it
        errSection = getattr(executor.report.report, "errors", None)
        if errSection == None:
            msg = "Job Report contains no error report, but cmsRun exited non-zero: %s" % errCode
            msg += addOn
            executor.report.addError(executor.step._internal_name,
                                     50116, "MissingErrorReport", msg)
            return

        else:
            # check exit code in report is non zero
            if executor.report.report.status == 0:
                msg = "Job Report contains no error report, but cmsRun exited non-zero: %s" % errCode
                msg += addOn
                executor.report.addError(executor.step._internal_name,
                                         50116, "MissingErrorReport", msg)

            else:
                msg = "Adding extra error in order to hold error report"
                msg += addOn
                executor.report.addError(executor.step._internal_name,
                                         99999, "ErrorLoggingAddition", msg)
        return
Example #25
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:

            if job.get('cache_dir', None) is None or job.get(
                    'retry_count', None) is None:
                # Then we can't do anything
                logging.error(
                    "Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'],
                                      'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue
            elif os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error(
                    "The job report for job with id %s and gridid %s is a directory",
                    job['id'], job['gridid'])
                logging.error("Ignoring this, but this is very strange")
            else:
                logging.error("No job report for job with id %s and gridid %s",
                              job['id'], job['gridid'])

                if os.path.isfile(reportName):
                    os.remove(reportName)

                # create a report from scratch
                condorReport = Report()
                logOutput = 'Could not find jobReport\n'

                if os.path.isdir(job['cache_dir']):
                    condorOut = "condor.%s.out" % job['gridid']
                    condorErr = "condor.%s.err" % job['gridid']
                    condorLog = "condor.%s.log" % job['gridid']
                    for condorFile in [condorOut, condorErr, condorLog]:
                        condorFilePath = os.path.join(job['cache_dir'],
                                                      condorFile)
                        if os.path.isfile(condorFilePath):
                            logTail = BasicAlgos.tail(condorFilePath, 50)
                            logOutput += 'Adding end of %s to error message:\n' % condorFile
                            logOutput += '\n'.join(logTail)
                    condorReport.addError("NoJobReport", 99303, "NoJobReport",
                                          logOutput)
                else:
                    msg = "Serious Error in Completing condor job with id %s!\n" % job[
                        'id']
                    msg += "Could not find jobCache directory %s\n" % job[
                        'cache_dir']
                    msg += "Creating a new cache_dir for failed job report\n"
                    logging.error(msg)
                    os.makedirs(job['cache_dir'])
                    condorReport.addError("NoJobReport", 99304, "NoCacheDir",
                                          logOutput)

                condorReport.save(filename=reportName)

                logging.debug(
                    "Created failed job report for job with id %s and gridid %s",
                    job['id'], job['gridid'])

        return
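Across all of the completion examples the fallback path is the same three steps: build an empty Report, attach an error, and pickle it where the framework expects the job report. Condensed from the snippets above:

    condorReport = Report()
    condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
    condorReport.save(filename=reportName)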