Exemple #1
0
    def complete(self, jobs):
        """
        Make sure every job handed in ends up with a job report on disk

        The report is expected at <cache_dir>/Report.<retry_count>.pkl:
          - jobs without a 'cache_dir' or 'retry_count' are logged and skipped
          - a non-empty report file is left untouched
          - a directory sitting at the report path is logged and left untouched
          - otherwise (report missing or empty) a failure report is built from
            the tails of the condor err/out/log files and saved at that path
        """

        for job in jobs:

            # any() stops at the first missing key, same as a chained "or"
            if any(job.get(key, None) is None
                   for key in ('cache_dir', 'retry_count')):
                logging.error(
                    "Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'],
                                      'Report.%i.pkl' % job['retry_count'])

            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # a usable report came back, nothing left to do for this job
                continue

            if os.path.isdir(reportName):
                # A directory where the report should be: complain and move on
                logging.error(
                    "The job report for job with id %s and gridid %s is a directory",
                    job['id'], job['gridid'])
                logging.error("Ignoring this, but this is very strange")
                continue

            # From here on the report is either missing or an empty file
            logging.error("No job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])

            if os.path.isfile(reportName):
                # drop the empty file so a fresh report can be written
                os.remove(reportName)

            condorReport = Report()
            logOutput = 'Could not find jobReport\n'

            if not os.path.isdir(job['cache_dir']):
                # Without a cache dir there are no condor files to inspect
                msg = "Serious Error in Completing condor job with id %s!\n" % job[
                    'id']
                msg += "Could not find jobCache directory %s\n" % job[
                    'cache_dir']
                msg += "Creating a new cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir",
                                      logOutput)
            else:
                gridId = job['gridid']
                condorLog = "condor.%s.log" % gridId
                condorFiles = ("condor.%s.err" % gridId,
                               "condor.%s.out" % gridId,
                               condorLog)

                # Defaults, overridden if the condor log gives a removal reason
                exitCode, exitType = 99303, "NoJobReport"

                for condorFile in condorFiles:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    logOutput += "\n========== %s ==========\n" % condorFile
                    if not os.path.isfile(condorFilePath):
                        continue

                    logTail = BasicAlgos.tail(condorFilePath, 50)
                    logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                    logOutput += logTail
                    logOutput += '\n\n'

                    if condorFile != condorLog:
                        # stderr / stdout: pull out any exception or error text
                        for matchObj in getIterMatchObjectOnRegexp(
                                condorFilePath, WMEXCEPTION_REGEXP):
                            for groupName in ('WMException', 'ERROR'):
                                errMsg = matchObj.group(groupName)
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg
                        continue

                    # condor log: look for the removal reason and the site
                    for matchObj in getIterMatchObjectOnRegexp(
                            condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                        condorReason = matchObj.group("Reason")
                        if condorReason:
                            logOutput += condorReason
                            removed = any(
                                marker in condorReason
                                for marker in ("SYSTEM_PERIODIC_REMOVE",
                                               "via condor_rm"))
                            if removed:
                                exitCode = 99400
                                exitType = "RemovedByGLIDEIN"
                            else:
                                # only the code changes, exitType is kept
                                exitCode = 99401

                        siteName = matchObj.group("Site")
                        if not siteName:
                            siteName = "NoReportedSite"
                        condorReport.data.siteName = siteName

                logOutput += '\n\n'
                condorReport.addError(exitType, exitCode, exitType, logOutput)

            condorReport.save(filename=reportName)

            logging.debug(
                "Created failed job report for job with id %s and gridid %s",
                job['id'], job['gridid'])

        return
    def complete(self, jobs):
        """
        Check each job for a returned job report and create one if needed

        The report is looked up as Report.<retry_count>.pkl inside the job's
        cache_dir. A job missing 'cache_dir' or 'retry_count' is logged and
        skipped; a non-empty report is accepted as is; a directory at the
        report path is logged and ignored. In every other case a failure
        report carrying the tails of the condor err/out/log files is saved
        at the report path.
        """

        for job in jobs:

            hasCacheDir = job.get('cache_dir', None) is not None
            if not (hasCacheDir and job.get('retry_count', None) is not None):
                # Nothing can be located without both values
                logging.error("Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])

            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # report is there and has content
                continue

            if os.path.isdir(reportName):
                # Unexpected, but deliberately left alone apart from logging
                logging.error("The job report for job with id %s and gridid %s is a directory", job['id'],
                              job['gridid'])
                logging.error("Ignoring this, but this is very strange")
                continue

            logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

            # An empty report file is removed before writing the replacement
            if os.path.isfile(reportName):
                os.remove(reportName)

            condorReport = Report()
            logOutput = 'Could not find jobReport\n'

            if os.path.isdir(job['cache_dir']):
                gridId = job['gridid']
                # (file name, whether it is the condor log) in processing order
                condorFiles = (("condor.%s.err" % gridId, False),
                               ("condor.%s.out" % gridId, False),
                               ("condor.%s.log" % gridId, True))
                exitCode = 99303
                exitType = "NoJobReport"

                for condorFile, isCondorLog in condorFiles:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    logOutput += "\n========== %s ==========\n" % condorFile
                    if not os.path.isfile(condorFilePath):
                        continue

                    logTail = BasicAlgos.tail(condorFilePath, 50)
                    logOutput += 'Adding end of %s to error message:\n\n' % condorFile + logTail + '\n\n'

                    if isCondorLog:
                        # the condor log may carry a removal reason and the site name
                        for matchObj in getIterMatchObjectOnRegexp(condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                            condorReason = matchObj.group("Reason")
                            if condorReason:
                                logOutput += condorReason
                                if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                    exitCode, exitType = 99400, "RemovedByGLIDEIN"
                                else:
                                    # exitType stays whatever it was before
                                    exitCode = 99401

                            condorReport.data.siteName = matchObj.group("Site") or "NoReportedSite"
                    else:
                        # err / out files: append any matched exception or error text
                        for matchObj in getIterMatchObjectOnRegexp(condorFilePath, WMEXCEPTION_REGEXP):
                            wmException = matchObj.group('WMException')
                            if wmException:
                                logOutput += "\n\n%s\n" % wmException

                            errorText = matchObj.group('ERROR')
                            if errorText:
                                logOutput += "\n\n%s\n" % errorText

                logOutput += '\n\n'
                condorReport.addError(exitType, exitCode, exitType, logOutput)
            else:
                # The cache dir itself is gone: recreate it so the report can be saved
                msg = ("Serious Error in Completing condor job with id %s!\n" % job['id']
                       + "Could not find jobCache directory %s\n" % job['cache_dir']
                       + "Creating a new cache_dir for failed job report\n")
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)

            condorReport.save(filename=reportName)

            logging.debug("Created failed job report for job with id %s and gridid %s", job['id'], job['gridid'])

        return