Example #1
    def generateCreateFailedReports(self, createFailedJobs):
        """
        _generateCreateFailedReports_

        Create and store FWJR for the jobs that failed on creation
        leaving meaningful information about what happened with them
        """
        if not createFailedJobs:
            return

        fjrsToSave = []
        for failedJob in createFailedJobs:
            report = Report()
            defaultMsg = "There is a condition which assures that this job will fail if it's submitted"
            report.addError("CreationFailure", 99305, "CreationFailure", failedJob.get("failedReason", defaultMsg))
            jobCache = failedJob.getCache()
            try:
                fjrPath = os.path.join(jobCache, "Report.0.pkl")
                report.save(fjrPath)
                fjrsToSave.append({"jobid": failedJob["id"], "fwjrpath": fjrPath})
                failedJob["fwjr"] = report
            except Exception:
                logging.error("Something went wrong while saving the report for job %s", failedJob["id"])

        myThread = threading.currentThread()
        self.setFWJRPath.execute(binds=fjrsToSave, conn=myThread.transaction.conn, transaction=True)

        return
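
The Report.0.pkl written above can later be read back with Report.load(); below is a minimal sketch (assuming the usual WMCore.FwkJobReport.Report import path and only the Report API already shown in this listing) of inspecting such a creation-failure report:

    # Minimal sketch, assuming the WMCore Report class used in the example above.
    # fjrPath is illustrative; in the example it comes from failedJob.getCache().
    from WMCore.FwkJobReport.Report import Report

    def loadCreationFailureReport(fjrPath):
        """Load a pickled FWJR and return its overall exit code."""
        report = Report()
        report.load(fjrPath)         # raises if the pickle is missing or corrupt
        return report.getExitCode()  # expected to be 99305 for the CreationFailure error
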
Example #2
    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout
        """
        if len(failedJobs) == 0:
            return

        jrBinds = []
        for job in failedJobs:
            # Make sure the job object goes packed with fwjr_path to be persisted in couch
            jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle
                logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
                fwjr = Report()
                msg = "The job failed due to a timeout, unfortunately the original job report was lost"
                fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return
Example #3
    def generateCreateFailedReports(self, createFailedJobs):
        """
        _generateCreateFailedReports_

        Create and store FWJR for the jobs that failed on creation
        leaving meaningful information about what happened with them
        """
        if not createFailedJobs:
            return

        fjrsToSave = []
        for failedJob in createFailedJobs:
            report = Report()
            defaultMsg = "There is a condition which assures that this job will fail if it's submitted"
            report.addError("CreationFailure", 99305, "CreationFailure", failedJob.get("failedReason", defaultMsg))
            jobCache = failedJob.getCache()
            try:
                fjrPath = os.path.join(jobCache, "Report.0.pkl")
                report.save(fjrPath)
                fjrsToSave.append({"jobid": failedJob["id"], "fwjrpath": fjrPath})
                failedJob["fwjr"] = report
            except Exception:
                logging.error("Something went wrong while saving the report for job %s", failedJob["id"])

        myThread = threading.currentThread()
        self.setFWJRPath.execute(binds=fjrsToSave, conn=myThread.transaction.conn, transaction=True)

        return
Example #4
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:

            if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue
            elif os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error("The job report for job with id %s and gridid %s is a directory", job['id'],
                              job['gridid'])
                logging.error("Ignoring this, but this is very strange")
            else:
                logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

                if os.path.isfile(reportName):
                    os.remove(reportName)

                # create a report from scratch
                condorReport = Report()
                logOutput = 'Could not find jobReport\n'

                if os.path.isdir(job['cache_dir']):
                    condorOut = "condor.%s.out" % job['gridid']
                    condorErr = "condor.%s.err" % job['gridid']
                    condorLog = "condor.%s.log" % job['gridid']
                    for condorFile in [condorOut, condorErr, condorLog]:
                        condorFilePath = os.path.join(job['cache_dir'], condorFile)
                        if os.path.isfile(condorFilePath):
                            logTail = BasicAlgos.tail(condorFilePath, 50)
                            logOutput += 'Adding end of %s to error message:\n' % condorFile
                            logOutput += '\n'.join(logTail)
                    condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
                else:
                    msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                    msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                    msg += "Creating a new cache_dir for failed job report\n"
                    logging.error(msg)
                    os.makedirs(job['cache_dir'])
                    condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)

                condorReport.save(filename=reportName)

                logging.debug("Created failed job report for job with id %s and gridid %s", job['id'], job['gridid'])

        return
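
Example #4 relies on BasicAlgos.tail to grab the last lines of the condor out/err/log files. Here is an illustrative stand-in (not the actual WMCore implementation) that returns the last n lines as a list, matching the '\n'.join(logTail) usage above:

    # Illustrative stand-in for BasicAlgos.tail; assumed behaviour only.
    from collections import deque

    def tail(filePath, nLines=50):
        """Return the last nLines lines of a file as a list of strings."""
        with open(filePath, 'r', errors='replace') as fh:
            return [line.rstrip('\n') for line in deque(fh, maxlen=nLines)]
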
Example #5
    def createMissingFWKJR(self, errorCode=999, errorDescription='Failure of unknown type'):
        """
        _createMissingFWJR_

        Create a missing FWJR if the report can't be found by the code in the
        path location.
        """
        report = Report()
        report.addError("cmsRun1", errorCode, "MissingJobReport", errorDescription)
        report.data.cmsRun1.status = "Failed"
        return report
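
A usage sketch for this helper (assumed class context; the error code below is illustrative, and getExitCode() is assumed to behave as in the testExitCode examples later in this listing):

    # Usage sketch: the resulting report carries the requested exit code and a failed step.
    report = self.createMissingFWKJR(errorCode=50115, errorDescription="cmsRun produced no report")
    assert report.getExitCode() == 50115
    assert report.data.cmsRun1.status == "Failed"
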
Example #6
    def createMissingFWKJR(self, errorCode=999, errorDescription='Failure of unknown type'):
        """
        _createMissingFWJR_

        Create a missing FWJR if the report can't be found by the code in the
        path location.
        """
        report = Report()
        report.addError("cmsRun1", 84, errorCode, errorDescription)
        report.data.cmsRun1.status = "Failed"
        return report
Example #7
    def submit(self, jobs, info=None):
        """
        _submit_

        Submits jobs to the condor queue
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            (sub, jobParams) = self.createSubmitRequest(jobsReady)

            logging.debug(
                "Start: Submitting %d jobs using Condor Python Submit",
                len(jobParams))
            try:
                with schedd.transaction() as txn:
                    submitRes = sub.queue_with_itemdata(
                        txn, 1, iter(jobParams))
                    clusterId = submitRes.cluster()
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.exception(str(ex))
                logging.error(
                    "Moving on to the next batch of jobs and/or cycle....")

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug(
                    "Job submission to condor succeeded, clusterId is %s",
                    clusterId)
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info(
            "Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
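
The grouper(jobs, self.jobsPerSubmit) call above batches the job list so each schedd transaction stays bounded. A minimal batching helper in the same spirit (illustrative only; the plugin imports its own grouper utility):

    # Minimal batching helper, shown only to illustrate the submit loop above.
    def grouper(iterable, n):
        """Yield successive lists of at most n items from iterable."""
        batch = []
        for item in iterable:
            batch.append(item)
            if len(batch) == n:
                yield batch
                batch = []
        if batch:
            yield batch
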
Example #8
    def submit(self, jobs, info=None):
        """
        _submit_

        Submits jobs to the condor queue
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(jobsReady)

            logging.debug(
                "Start: Submitting %d jobs using Condor Python SubmitMany",
                len(procAds))
            try:
                # 4th argument has to be None otherwise HTCondor leaks the result ads
                # through it (as of 8.7.x). More info in WMCore/#8729
                clusterId = schedd.submitMany(clusterAd, procAds, False, None)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.exception(str(ex))
                logging.error(
                    "Moving on to the next batch of jobs and/or cycle....")

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug(
                    "Job submission to condor succeeded, clusterId is %s",
                    clusterId)
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info(
            "Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Example #9
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(jobsReady)

            logging.debug(
                "Start: Submitting %d jobs using Condor Python SubmitMany",
                len(procAds))
            try:
                clusterId = schedd.submitMany(clusterAd, procAds)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.error(
                    "Moving on to the next batch of jobs and/or cycle....")
                logging.exception(ex)

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError",
                                           str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug(
                    "Finish: Submitting jobs using Condor Python SubmitMany")
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info(
            "Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Example #10
    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout
        """

        if len(failedJobs) == 0:
            return

        myThread = threading.currentThread()

        # Load DAOs
        setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        loadAction = self.daoFactory(classname="Jobs.LoadFromID")

        jrBinds = []

        for job in failedJobs:
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})
            # Make sure the job object goes packed with fwjr_path so it
            # can be persisted in couch

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle
                logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
                fwjr = Report()
                msg = "The job failed due to a timeout, unfortunately the original job report was lost"
                fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
                fwjr.save(jrPath)
            job["fwjr"] = fwjr
        
        # Set all paths at once
        myThread.transaction.begin()
        setFWJRAction.execute(binds=jrBinds)

        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return
Example #11
    def createReport(self, outcome=0):
        """
        Create a test report

        """

        jobReport = Report()
        jobReport.addStep('cmsRun1')
        jobReport.setStepStartTime(stepName='cmsRun1')
        jobReport.setStepStopTime(stepName='cmsRun1')
        if outcome:
            jobReport.addError('cmsRun1', 200, 'FakeError', 'FakeError')

        return jobReport
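
A usage sketch for this test helper (assumed test-case context; getStepExitCode is shown in the testExitCode examples below):

    # Usage sketch: a non-zero outcome attaches the FakeError with exit code 200.
    report = self.createReport(outcome=1)
    self.assertEqual(report.getStepExitCode(stepName='cmsRun1'), 200)
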
Example #12
    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout
        """

        if len(failedJobs) == 0:
            return

        myThread = threading.currentThread()

        # Load DAOs
        setFWJRAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        loadAction = self.daoFactory(classname="Jobs.LoadFromID")

        jrBinds = []

        for job in failedJobs:
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})
            # Make sure the job object goes packed with fwjr_path so it
            # can be persisted in couch

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle
                logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
                fwjr = Report()
                msg = "The job failed due to a timeout, unfortunately the original job report was lost"
                fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        # Set all paths at once
        myThread.transaction.begin()
        setFWJRAction.execute(binds=jrBinds)

        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return
Example #13
    def submit(self, jobs, info=None):
        """
        _submit_

        Submits jobs to the condor queue
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(jobsReady)

            logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
            try:
                # 4th argument has to be None otherwise HTCondor leaks the result ads
                # through it (as of 8.7.x). More info in WMCore/#8729
                clusterId = schedd.submitMany(clusterAd, procAds, False, None)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.exception(str(ex))
                logging.error("Moving on to the next batch of jobs and/or cycle....")

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Example #14
    def testExitCode(self):
        """
        _testExitCode_

        Test and see if we can get an exit code out of a report

        Note: Errors without a return code return 99999
        """

        report = Report("cmsRun1")
        self.assertEqual(report.getExitCode(), 0)
        report.addError(stepName="cmsRun1", exitCode=None, errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 99999)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 99999)
        report.addError(stepName="cmsRun1", exitCode='12345', errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 12345)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
Example #15
    def testExitCode(self):
        """
        _testExitCode_

        Test and see if we can get an exit code out of a report

        Note: Errors without a return code return 99999
        """

        report = Report("cmsRun1")
        self.assertEqual(report.getExitCode(), 0)
        report.addError(stepName="cmsRun1", exitCode=None, errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 99999)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 99999)
        report.addError(stepName="cmsRun1", exitCode='12345', errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 12345)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
Example #16
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            cluster_ad = self.getClusterAd()
            proc_ads = self.getProcAds(jobsReady)

            logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(proc_ads))
            try:
                clusterId = schedd.submitMany(cluster_ad, proc_ads)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.error("Moving on to the next batch of jobs and/or cycle....")
                logging.exception(ex)

                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Example #17
    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout
        """
        if len(failedJobs) == 0:
            return

        jrBinds = []
        for job in failedJobs:
            # Make sure the job object goes packed with fwjr_path to be persisted in couch
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle
                logging.error(
                    "The pickle in %s could not be loaded, generating a new one",
                    jrPath)
                fwjr = Report()
                fwjr.addError("NoJobReport", 99303, "NoJobReport",
                              WM_JOB_ERROR_CODES[99303])
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        myThread = threading.currentThread()
        myThread.transaction.begin()
        self.setFWJRAction.execute(binds=jrBinds,
                                   conn=myThread.transaction.conn,
                                   transaction=True)
        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return
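
Unlike the earlier variants, this version pulls its error message from the WM_JOB_ERROR_CODES map instead of hard-coding it. If there is any doubt that a given code is present in the map, a defensive lookup keeps report generation from raising; a hedged sketch (the import path and fallback message are assumptions):

    # Hedged sketch: WM_JOB_ERROR_CODES is assumed to be a plain dict keyed by exit code,
    # as its use in Example #17 suggests; .get() provides a fallback message.
    from WMCore.WMExceptions import WM_JOB_ERROR_CODES   # assumed import path

    msg = WM_JOB_ERROR_CODES.get(99303, "Job failed and no job report could be found")
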
Example #18
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:
            if job.get("cache_dir", None) is None or job.get("retry_count", None) is None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
                logging.error("cache_dir: %s", job.get("cache_dir", "Missing"))
                logging.error("retry_count: %s", job.get("retry_count", "Missing"))
                continue
            reportName = os.path.join(job["cache_dir"], "Report.%i.pkl" % job["retry_count"])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # Then we have a real report.
                # Do nothing
                continue
            if os.path.isdir(reportName):
                # Then something weird has happened.
                # File error, do nothing
                logging.error("Went to check on error report for job %i.  Found a directory instead.\n", job["id"])
                logging.error("Ignoring this, but this is very strange.\n")

            # If we're still here, we must not have a real error report
            logOutput = "Could not find jobReport"
            logPath = os.path.join(job["cache_dir"], "condor.log")
            if os.path.isfile(logPath):
                logTail = BasicAlgos.tail(logPath, 50)
                logOutput += "Adding end of condor.log to error message:\n"
                logOutput += logTail
            condorReport = Report()
            condorReport.addError("NoJobReport", 61303, "NoJobReport", logOutput)
            condorReport.save(filename=reportName)
            logging.debug("No returning job report for job %i", job["id"])

        return
Example #19
                queueError = True
                continue

            if exitCode != 0:
                logging.error("Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error=error)
                logging.error("Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break
Example #20
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:

            if job.get('cache_dir', None) is None or job.get(
                    'retry_count', None) is None:
                # Then we can't do anything
                logging.error(
                    "Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'],
                                      'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue
            elif os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error(
                    "The job report for job with id %s and gridid %s is a directory",
                    job['id'], job['gridid'])
                logging.error("Ignoring this, but this is very strange")
            else:
                logging.error("No job report for job with id %s and gridid %s",
                              job['id'], job['gridid'])

                if os.path.isfile(reportName):
                    os.remove(reportName)

                # create a report from scratch
                condorReport = Report()
                logOutput = 'Could not find jobReport\n'

                if os.path.isdir(job['cache_dir']):
                    condorErr = "condor.%s.err" % job['gridid']
                    condorOut = "condor.%s.out" % job['gridid']
                    condorLog = "condor.%s.log" % job['gridid']
                    exitCode = 99303
                    exitType = "NoJobReport"
                    for condorFile in [condorErr, condorOut, condorLog]:
                        condorFilePath = os.path.join(job['cache_dir'],
                                                      condorFile)
                        logOutput += "\n========== %s ==========\n" % condorFile
                        if os.path.isfile(condorFilePath):
                            logTail = BasicAlgos.tail(condorFilePath, 50)
                            logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                            logOutput += logTail
                            logOutput += '\n\n'

                            if condorFile == condorLog:
                                # for condor log, search for the information
                                for matchObj in getIterMatchObjectOnRegexp(
                                        condorFilePath,
                                        CONDOR_LOG_FILTER_REGEXP):
                                    condorReason = matchObj.group("Reason")
                                    if condorReason:
                                        logOutput += condorReason
                                        if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                            exitCode = 99400
                                            exitType = "RemovedByGLIDEIN"
                                        else:
                                            exitCode = 99401

                                    siteName = matchObj.group("Site")
                                    if siteName:
                                        condorReport.data.siteName = siteName
                                    else:
                                        condorReport.data.siteName = "NoReportedSite"
                            else:
                                for matchObj in getIterMatchObjectOnRegexp(
                                        condorFilePath, WMEXCEPTION_REGEXP):
                                    errMsg = matchObj.group('WMException')
                                    if errMsg:
                                        logOutput += "\n\n%s\n" % errMsg

                                    errMsg = matchObj.group('ERROR')
                                    if errMsg:
                                        logOutput += "\n\n%s\n" % errMsg

                    logOutput += '\n\n'
                    condorReport.addError(exitType, exitCode, exitType,
                                          logOutput)
                else:
                    msg = "Serious Error in Completing condor job with id %s!\n" % job[
                        'id']
                    msg += "Could not find jobCache directory %s\n" % job[
                        'cache_dir']
                    msg += "Creating a new cache_dir for failed job report\n"
                    logging.error(msg)
                    os.makedirs(job['cache_dir'])
                    condorReport.addError("NoJobReport", 99304, "NoCacheDir",
                                          logOutput)

                condorReport.save(filename=reportName)

                logging.debug(
                    "Created failed job report for job with id %s and gridid %s",
                    job['id'], job['gridid'])

        return
Example #21
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:
            if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
                logging.error("cache_dir: %s", job.get('cache_dir', 'Missing'))
                logging.error("retry_count: %s", job.get('retry_count', 'Missing'))
                continue
            reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # Then we have a real report.
                # Do nothing
                continue
            if os.path.isdir(reportName):
                # Then something weird has happened.
                # File error, do nothing
                logging.error("Went to check on error report for job %i.  Found a directory instead.\n", job['id'])
                logging.error("Ignoring this, but this is very strange.\n")

            # If we're still here, we must not have a real error report
            logOutput = 'Could not find jobReport\n'
            # But we don't know exactly the condor id, so it will append
            # the last lines of the latest condor log in cache_dir
            genLogPath = os.path.join(job['cache_dir'], 'condor.*.*.log')
            logPaths = glob.glob(genLogPath)
            errLog = None
            if len(logPaths):
                errLog = max(logPaths, key=lambda path: os.stat(path).st_mtime)
            if errLog is not None and os.path.isfile(errLog):
                logTail = BasicAlgos.tail(errLog, 50)
                logOutput += 'Adding end of condor.log to error message:\n'
                logOutput += '\n'.join(logTail)
            if not os.path.isdir(job['cache_dir']):
                msg = "Serious Error in Completing condor job with id %s!\n" % job.get('id', 'unknown')
                msg += "Could not find jobCache directory - directory deleted under job: %s\n" % job['cache_dir']
                msg += "Creating artificial cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                logOutput += msg
                condorReport = Report()
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
                condorReport.save(filename=reportName)
                continue
            condorReport = Report()
            condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
            if os.path.isfile(reportName):
                # Then we have a file already there.  It should be zero size due
                # to the if statements above, but we should remove it.
                if os.path.getsize(reportName) > 0:
                    # This should never happen.  If it does, ignore it
                    msg = "Critical strange problem.  FWJR changed size while being processed."
                    logging.error(msg)
                else:
                    try:
                        os.remove(reportName)
                        condorReport.save(filename=reportName)
                    except Exception as ex:
                        logging.error("Cannot remove and replace empty report %s", reportName)
                        logging.error("Report continuing without error!")
            else:
                condorReport.save(filename=reportName)

            # Debug message to end loop
            logging.debug("No returning job report for job %i", job['id'])

        return
Example #22
                queueError = True
                continue

            if exitCode != 0:
                logging.error("Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error=error)
                logging.error("Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break
Example #23
    def submit(self, jobs, info=None):
        """
        _submit_

        Submit jobs for one subscription

        """
        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.queue = self.config.JobSubmitter.LsfPluginQueue
        self.resourceReq = getattr(self.config.JobSubmitter,
                                   'LsfPluginResourceReq', None)
        self.jobGroup = self.config.JobSubmitter.LsfPluginJobGroup
        self.batchOutput = getattr(self.config.JobSubmitter,
                                   'LsfPluginBatchOutput', None)

        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        # Now assume that what we get is the following; a mostly
        # unordered list of jobs with random sandboxes.
        # We intend to sort them by sandbox.

        submitDict = {}
        for job in jobs:
            sandbox = job['sandbox']
            if sandbox not in submitDict:
                submitDict[sandbox] = []
            submitDict[sandbox].append(job)

        # Now submit the bastards
        for sandbox in submitDict.keys():
            jobList = submitDict.get(sandbox, [])
            while len(jobList) > 0:
                jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
                jobList = jobList[self.config.JobSubmitter.jobsPerWorker:]

                for job in jobsReady:

                    if job == {}:
                        # Then I don't know how we got here either
                        logging.error(
                            "Was passed a nonexistent job.  Ignoring")
                        continue

                    submitScript = self.makeSubmit(job)

                    if not submitScript:
                        # Then we got nothing
                        logging.error("No submit script made!")
                        return {'NoResult': [0]}

                    submitScriptFile = os.path.join(job['cache_dir'],
                                                    "submit.sh")
                    handle = open(submitScriptFile, 'w')
                    handle.writelines(submitScript)
                    handle.close()

                    # make reasonable job name
                    jobName = "WMAgentJob"
                    regExpParser = re.compile(
                        '.*/JobCreator/JobCache/([^/]+)/[^/]+/.*')
                    match = regExpParser.match(job['cache_dir'])
                    if match is not None:
                        jobName = "%s-%s" % (match.group(1), job['id'])

                    # //
                    # // Submit LSF job
                    # //
                    command = 'bsub'
                    command += ' -q %s' % self.queue

                    if self.resourceReq is not None:
                        command += ' -R "%s"' % self.resourceReq

                    command += ' -g %s' % self.jobGroup
                    command += ' -J %s' % jobName

                    lsfLogDir = self.batchOutput
                    if lsfLogDir is not None:
                        now = datetime.datetime.today()
                        lsfLogDir += '/%s' % now.strftime("%Y%m%d%H")
                        try:
                            os.mkdir(lsfLogDir)
                            logging.debug("Created directory %s", lsfLogDir)
                        except OSError as err:
                            # suppress LSF log unless it's about an already existing directory
                            if err.errno != errno.EEXIST or not os.path.isdir(lsfLogDir):
                                logging.error("Can't create directory %s, turning off LSF log", lsfLogDir)
                                lsfLogDir = None

                    if lsfLogDir is None:
                        command += ' -oo /dev/null'
                    else:
                        command += ' -oo %s/%s.%%J.out' % (lsfLogDir, jobName)

                    command += ' < %s' % submitScriptFile

                    logging.info("Submitting LSF job: %s", command)

                    p = subprocess.Popen(command,
                                         shell=True,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT)

                    stdout = p.communicate()[0]
                    returncode = p.returncode

                    if returncode == 0:
                        # check for correct naming convention in PFN
                        regExpParser = re.compile(
                            'Job <([0-9]+)> is submitted to queue')
                        match = regExpParser.match(stdout)
                        if match is not None:
                            job['gridid'] = match.group(1)
                            successfulJobs.append(job)
                            logging.info("LSF Job ID : %s", job['gridid'])
                            continue
                        else:
                            logging.error(
                                "bsub didn't return a valid Job ID. Job is not submitted"
                            )
                            logging.error(stdout)

                    lsfErrorReport = Report()
                    lsfErrorReport.addError("JobSubmit", 61202, "LsfError",
                                            stdout)
                    job['fwjr'] = lsfErrorReport
                    failedJobs.append(job)
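
A quick standalone check of the bsub output parsing used above (the sample output line is illustrative, not captured from a real LSF cluster):

    # Illustrative check of the LSF job-id regular expression used in Example #23.
    import re

    sampleOutput = "Job <123456> is submitted to queue <cmscaf1nd>."
    match = re.compile(r'Job <([0-9]+)> is submitted to queue').match(sampleOutput)
    if match is not None:
        print(match.group(1))   # -> "123456"
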
Example #24
    def submit(self, jobs, info):
        """
        _submit_

        
        Submit jobs for one subscription
        """

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            for x in range(self.nProcess):
                p = multiprocessing.Process(target=submitWorker, args=(self.input, self.result))
                p.start()
                self.pool.append(p)

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir = self.config.JobSubmitter.submitDir
        timeout = getattr(self.config.JobSubmitter, "getTimeout", 300)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)

        successfulJobs = []
        failedJobs = []
        jdlFiles = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        # Now assume that what we get is the following; a mostly
        # unordered list of jobs with random sandboxes.
        # We intend to sort them by sandbox.

        submitDict = {}
        nSubmits = 0
        for job in jobs:
            sandbox = job["sandbox"]
            if sandbox not in submitDict:
                submitDict[sandbox] = []
            submitDict[sandbox].append(job)

        # Now submit the bastards
        for sandbox in submitDict.keys():
            jobList = submitDict.get(sandbox, [])
            idList = [x["jobid"] for x in jobList]
            while len(jobList) > 0:
                jobsReady = jobList[: self.config.JobSubmitter.jobsPerWorker]
                jobList = jobList[self.config.JobSubmitter.jobsPerWorker :]
                idList = [x["id"] for x in jobsReady]
                jdlList = self.makeSubmit(jobList=jobsReady)
                if not jdlList or jdlList == []:
                    # Then we got nothing
                    logging.error("No JDL file made!")
                    return {"NoResult": [0]}
                jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
                handle = open(jdlFile, "w")
                handle.writelines(jdlList)
                handle.close()
                jdlFiles.append(jdlFile)

                # Now submit them
                logging.info("About to submit %i jobs", len(jobsReady))
                command = "condor_submit %s" % jdlFile
                self.input.put({"command": command, "idList": idList})
                nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for n in range(nSubmits):
            res = self.result.get(block=True, timeout=timeout)
            output = res["stdout"]
            error = res["stderr"]
            idList = res["idList"]

            if error != "":
                logging.error("Printing out command stderr")
                logging.error(error)

            errorCheck, errorMsg = parseError(error=error)

            if errorCheck:
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get("id", None) == jobID:
                            job["fwjr"] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                for jobID in idList:
                    for job in jobs:
                        if job.get("id", None) == jobID:
                            successfulJobs.append(job)
                            break

        # Remove JDL files unless commanded otherwise
        if getattr(self.config.JobSubmitter, "deleteJDLFiles", True):
            for f in jdlFiles:
                os.remove(f)

        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        return successfulJobs, failedJobs
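
The worker processes started at the top of this method consume {"command", "idList"} dictionaries from self.input and push {"stdout", "stderr", "idList"} onto self.result. An illustrative stand-in for such a submitWorker target (assumed behaviour, not the actual WMCore implementation):

    # Illustrative stand-in for the submitWorker target used in Example #24.
    import subprocess

    def submitWorker(inputQueue, resultQueue):
        while True:
            work = inputQueue.get()
            if work == 'STOP':        # assumed shutdown sentinel
                break
            proc = subprocess.Popen(work['command'], shell=True,
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = proc.communicate()
            resultQueue.put({'stdout': stdout.decode('utf-8', 'replace'),
                             'stderr': stderr.decode('utf-8', 'replace'),
                             'idList': work['idList']})
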
Example #25
    def kill(self, jobs, killMsg=None, errorCode=61300):
        """
        _kill_

        Kill jobs using plugin functions:

        Only active jobs (status = 1) will be killed
        An optional killMsg can be sent; this will be written
        into the job FWJR. The errorCode will be the one specified
        and if no killMsg is provided then a standard message associated with the
        exit code will be used.
        If a previous FWJR exists, this error will be appended to it.
        """
        if not len(jobs):
            # Nothing to do here
            return
        self.check()
        jobsToKill = {}

        # Now get a list of which jobs are in the batch system
        # only kill jobs present there
        loadedJobs = self._buildRunningJobs(wmbsJobs = jobs)

        for runningJob in loadedJobs:
            plugin = runningJob['plugin']
            if plugin not in jobsToKill:
                jobsToKill[plugin] = []
            jobsToKill[plugin].append(runningJob)

        for plugin in jobsToKill.keys():
            if plugin not in self.plugins:
                msg = "Jobs tracking with non-existent plugin %s\n" % (plugin)
                msg += "They were submitted but can't be tracked?\n"
                msg += "That's too strange to continue\n"
                logging.error(msg)
                raise BossAirException(msg)
            else:
                # Then we send them to the plugins
                try:
                    pluginInst = self.plugins[plugin]
                    pluginInst.kill(jobs = jobsToKill[plugin])
                    # Register the killed jobs
                    for job in jobsToKill[plugin]:
                        if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
                            continue
                        # Try to save an error report as the jobFWJR
                        if not os.path.isdir(job['cache_dir']):
                            # Then we have a bad cache directory
                            logging.error("Could not write a kill FWJR due to non-existent cache_dir for job %i\n", job['id'])
                            logging.debug("cache_dir: %s\n", job['cache_dir'])
                            continue
                        reportName = os.path.join(job['cache_dir'],
                                                      'Report.%i.pkl' % job['retry_count'])
                        errorReport = Report()
                        if os.path.exists(reportName) and os.path.getsize(reportName) > 0:
                            # Then there's already a report there.  Add messages
                            errorReport.load(reportName)
                        # Build a better job message
                        if killMsg:
                            reportedMsg = killMsg
                            reportedMsg += '\n Job last known status was: %s' % job.get('globalState', 'Unknown')
                        else:
                            reportedMsg = WMJobErrorCodes[errorCode]
                            reportedMsg += '\n Job last known status was: %s' % job.get('globalState', 'Unknown')
                        errorReport.addError("JobKilled", errorCode, "JobKilled", reportedMsg)
                        try:
                            errorReport.save(filename=reportName)
                        except IOError as ioe:
                            logging.warning('Cannot write report %s because of %s', reportName, ioe)
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while calling kill method for plugin %s\n" % plugin
                    msg += str(ex)
                    logging.error(msg)
                    logging.debug("Interrupted while killing following jobs: %s\n", jobsToKill[plugin])
                    raise BossAirException(msg)
                finally:
                    # Even if kill fails, complete the jobs
                    self._complete(jobs = jobsToKill[plugin])
        return
Example #26
    def execute(self, emulator = None):
        """
        _execute_


        """
        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)


        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to two hours per retry
        # this alarm leaves a subprocess behind that may cause trouble, see #6273
        waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

        logging.info("StageOut override is: %s ", self.step)

        # Pull out StageOutMgr Overrides

        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True


        stageOutCall = {}
        if "command" in overrides and "option" in overrides \
               and "phedex-node" in overrides \
               and "lfn-prefix" in overrides:
            logging.critical('using override in StageOut')
            stageOutCall['command'] = overrides.get('command')
            stageOutCall['option'] = overrides.get('option')
            stageOutCall['phedex-node'] = overrides.get('phedex-node')
            stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

        # naw man, this is real
        # iterate over all the incoming files
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr(**stageOutCall)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime  = self.step.retryDelay
        else:
            # new style
            logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
            print("STAGEOUT IS USING NEW STAGEOUT CODE")
            manager = FMStageOutMgr(retryPauseTime  = self.step.retryDelay,
                                    numberOfRetries = self.step.retryCount,
                                    **stageOutCall)

        # We need to find a list of steps in our task
        # And eventually a list of jobReports for out steps

        # Search through steps for report files
        filesTransferred = []

        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                #Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
            logging.info("Beginning report processing for step %s", step)
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s", step, stepLocation)
                continue
            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)

            # Don't stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Okay, time to start using stuff
            # Now I'm a bit confused about this; each report should ONLY
            # Have the results of that particular step in it,
            # So getting all the files should get ONLY the files
            # for that step; or so I hope
            files = stepReport.getAllFileRefsFromStep(step = step)
            for fileName in files:

                # make sure the file information is consistent
                if hasattr(fileName, 'pfn') and ( not hasattr(fileName, 'lfn') or not hasattr(fileName, 'module_label') ):
                    msg = "Not a valid file: %s" % fileName
                    logging.error(msg)
                    continue

                # Figuring out if we should do straight to merge
                #  - should we do straight to merge at all ?
                #  - is straight to merge disabled for this output ?
                #  - are we over the size threshold
                #  - are we over the event threshold ?
                straightToMerge = False
                if not getattr(fileName, 'merged', False) and hasattr(self.step.output, 'minMergeSize'):
                    if fileName.module_label not in getattr(self.step.output, 'forceUnmergedOutputs', []):
                        if getattr(fileName, 'size', 0) >= self.step.output.minMergeSize:
                            straightToMerge = True
                        if getattr(fileName, 'events', 0) >= getattr(self.step.output, 'maxMergeEvents', sys.maxsize):
                            straightToMerge = True

                if straightToMerge:

                    try:
                        fileName = self.handleLFNForMerge(mergefile = fileName,
                                                          step = step)
                    except Exception as ex:
                        logging.info("minMergeSize: %s", getattr(self.step.output, 'minMergeSize', None))
                        logging.info("maxMergeEvents: %s", getattr(self.step.output, 'maxMergeEvents', None))
                        logging.error("Encountered error while handling LFN for merge %s", fileName)
                        logging.error(str(ex))
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))

                # Save the input PFN in case we need it
                # Undecided whether to move fileName.pfn to the output PFN
                fileName.InputPFN = fileName.pfn
                lfn = getattr(fileName, 'lfn')
                fileSource = getattr(fileName, 'Source', None)
                if fileSource in ['TFileService', 'UserDefined']:
                    userLfnRegEx(lfn)
                else:
                    lfnRegEx(lfn)

                fileForTransfer = {'LFN': lfn,
                                   'PFN': getattr(fileName, 'pfn'),
                                   'PNN' : None,
                                   'StageOutCommand': None,
                                   'Checksums' : getattr(fileName, 'checksums', None)}

                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    manager(fileForTransfer)
                    #Afterwards, the file should have updated info.
                    filesTransferred.append(fileForTransfer)
                    fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                    fileName.location        = fileForTransfer['PNN']
                    fileName.OutputPFN       = fileForTransfer['PFN']
                except Alarm:
                    msg = "Indefinite hang during stageOut of logArchive"
                    logging.error(msg)
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                    stepReport.setStepStatus(self.stepName, 1)
                    # well, if it fails for one file, it fails for the whole job...
                    break
                except Exception as ex:
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                    stepReport.setStepStatus(self.stepName, 1)
                    stepReport.persist(reportLocation)
                    raise

                signal.alarm(0)

            # Am DONE with report. Persist it
            stepReport.persist(reportLocation)

        #Done with all steps, and should have a list of
        #stagedOut files in filesTransferred
        logging.info("Transferred %i files", len(filesTransferred))
        return
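
The stage-out call above is guarded by a SIGALRM watchdog so that a hung transfer cannot block the step forever. A minimal, self-contained sketch of that pattern follows (Unix only; the Alarm exception, alarmHandler and do_transfer below are illustrative stand-ins, not the WMCore implementations):

import signal
import time

class Alarm(Exception):
    """Raised when the watchdog timer fires."""
    pass

def alarmHandler(signum, frame):
    # Turn the SIGALRM signal into a Python exception
    raise Alarm()

def do_transfer(seconds):
    # Stand-in for a stage-out call that may hang
    time.sleep(seconds)

waitTime = 2  # seconds before the watchdog fires

signal.signal(signal.SIGALRM, alarmHandler)
signal.alarm(waitTime)
try:
    do_transfer(5)        # takes longer than waitTime
except Alarm:
    print("transfer timed out, cleaning up")
finally:
    signal.alarm(0)       # always disarm the timer
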
Ejemplo n.º 27
0
    def execute(self, emulator=None):
        """
        _execute_


        """
        #Are we using emulators again?
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to two hours per retry
        # this alarm leaves a subprocess behind that may cause trouble, see #6273
        waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

        logging.info("StageOut override is: %s ", self.step)

        # Pull out StageOutMgr Overrides

        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        stageOutCall = {}
        if "command" in overrides and "option" in overrides \
               and "phedex-node" in overrides \
               and"lfn-prefix" in overrides:
            logging.critical('using override in StageOut')
            stageOutCall['command'] = overrides.get('command')
            stageOutCall['option'] = overrides.get('option')
            stageOutCall['phedex-node'] = overrides.get('phedex-node')
            stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

        # naw man, this is real
        # iterate over all the incoming files
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr(**stageOutCall)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime = self.step.retryDelay
        else:
            # new style
            logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
            print("STAGEOUT IS USING NEW STAGEOUT CODE")
            manager = FMStageOutMgr(retryPauseTime=self.step.retryDelay,
                                    numberOfRetries=self.step.retryCount,
                                    **stageOutCall)

        # We need to find a list of steps in our task
        # And eventually a list of jobReports for our steps

        # Search through steps for report files
        filesTransferred = []

        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                #Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location,
                                        step)
            logging.info("Beginning report processing for step %s", step)
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s",
                              step, stepLocation)
                continue
            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)

            # Don't stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Okay, time to start using stuff
            # Now I'm a bit confused about this; each report should ONLY
            # Have the results of that particular step in it,
            # So getting all the files should get ONLY the files
            # for that step; or so I hope
            files = stepReport.getAllFileRefsFromStep(step=step)
            for fileName in files:

                # make sure the file information is consistent
                if hasattr(fileName,
                           'pfn') and (not hasattr(fileName, 'lfn') or
                                       not hasattr(fileName, 'module_label')):
                    msg = "Not a valid file: %s" % fileName
                    logging.error(msg)
                    continue

                # Figuring out if we should do straight to merge
                #  - should we do straight to merge at all ?
                #  - is straight to merge disabled for this output ?
                #  - are we over the size threshold
                #  - are we over the event threshold ?
                straightToMerge = False
                if not getattr(fileName, 'merged', False) and hasattr(
                        self.step.output, 'minMergeSize'):
                    if fileName.module_label not in getattr(
                            self.step.output, 'forceUnmergedOutputs', []):
                        if getattr(fileName, 'size',
                                   0) >= self.step.output.minMergeSize:
                            straightToMerge = True
                        if getattr(fileName, 'events', 0) >= getattr(
                                self.step.output, 'maxMergeEvents',
                                sys.maxsize):
                            straightToMerge = True

                if straightToMerge:

                    try:
                        fileName = self.handleLFNForMerge(mergefile=fileName,
                                                          step=step)
                    except Exception as ex:
                        logging.info(
                            "minMergeSize: %s",
                            getattr(self.step.output, 'minMergeSize', None))
                        logging.info(
                            "maxMergeEvents: %s",
                            getattr(self.step.output, 'maxMergeEvents', None))
                        logging.error(
                            "Encountered error while handling LFN for merge %s",
                            fileName)
                        logging.error(str(ex))
                        manager.cleanSuccessfulStageOuts()
                        stepReport.addError(self.stepName, 60401,
                                            "DirectToMergeFailure", str(ex))

                # Save the input PFN in case we need it
                # Undecided whether to move fileName.pfn to the output PFN
                fileName.InputPFN = fileName.pfn
                lfn = getattr(fileName, 'lfn')
                fileSource = getattr(fileName, 'Source', None)
                if fileSource in ['TFileService', 'UserDefined']:
                    userLfnRegEx(lfn)
                else:
                    lfnRegEx(lfn)

                fileForTransfer = {
                    'LFN': lfn,
                    'PFN': getattr(fileName, 'pfn'),
                    'PNN': None,
                    'StageOutCommand': None,
                    'Checksums': getattr(fileName, 'checksums', None)
                }

                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    manager(fileForTransfer)
                    #Afterwards, the file should have updated info.
                    filesTransferred.append(fileForTransfer)
                    fileName.StageOutCommand = fileForTransfer[
                        'StageOutCommand']
                    fileName.location = fileForTransfer['PNN']
                    fileName.OutputPFN = fileForTransfer['PFN']
                except Alarm:
                    msg = "Indefinite hang during stageOut of logArchive"
                    logging.error(msg)
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60403,
                                        "StageOutTimeout", msg)
                    stepReport.setStepStatus(self.stepName, 1)
                    # well, if it fails for one file, it fails for the whole job...
                    break
                except Exception as ex:
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60307,
                                        "StageOutFailure", str(ex))
                    stepReport.setStepStatus(self.stepName, 1)
                    stepReport.persist(reportLocation)
                    raise

                signal.alarm(0)

            # Am DONE with report. Persist it
            stepReport.persist(reportLocation)

        #Done with all steps, and should have a list of
        #stagedOut files in filesTransferred
        logging.info("Transferred %i files", len(filesTransferred))
        return
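
The straight-to-merge decision above reduces to a handful of attribute checks. A standalone sketch of the same rule, using plain dicts in place of the report file objects and the step output section (names are illustrative):

import sys

def should_direct_merge(fileInfo, output):
    """
    Decide whether an unmerged output file can skip the merge step:
    it must not be forced to stay unmerged, and it must exceed either
    the size threshold or the event threshold.
    """
    if fileInfo.get('merged', False):
        return False
    if 'minMergeSize' not in output:
        return False
    if fileInfo.get('module_label') in output.get('forceUnmergedOutputs', []):
        return False
    overSize = fileInfo.get('size', 0) >= output['minMergeSize']
    overEvents = fileInfo.get('events', 0) >= output.get('maxMergeEvents', sys.maxsize)
    return overSize or overEvents

# A 3 GB file against a 2 GB threshold goes straight to merge
print(should_direct_merge({'size': 3 * 1024 ** 3, 'events': 1000, 'module_label': 'RECOoutput'},
                          {'minMergeSize': 2 * 1024 ** 3}))   # True
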
Ejemplo n.º 28
0
    def testExitCode(self):
        """
        _testExitCode_

        Test and see if we can get an exit code out of a report

        Note: Errors without a return code return 99999
        getStepExitCode: returns the first valid and non-zero exit code
        getExitCode: uses the method above to get an exit code
        getStepExitCodes: returns a set of all exit codes within the step
        """

        report = Report("cmsRun1")
        self.assertEqual(report.getExitCode(), 0)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 0)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {})
        self.assertItemsEqual(report.getStepErrors(stepName="cmsRun1"), {})

        report.addError(stepName="cmsRun1",
                        exitCode=None,
                        errorType="test",
                        errorDetails="test")
        # None is not a valid exitCode, but it will get mapped to 99999
        self.assertEqual(report.getExitCode(), 99999)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 99999)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"),
                              {99999})
        self.assertEqual(
            report.getStepErrors(stepName="cmsRun1")['errorCount'], 1)

        report.addError(stepName="cmsRun1",
                        exitCode=12345,
                        errorType="test",
                        errorDetails="test")
        self.assertEqual(report.getExitCode(), 12345)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"),
                              {99999, 12345})
        self.assertEqual(
            report.getStepErrors(stepName="cmsRun1")['errorCount'], 2)

        report.addError(stepName="cmsRun1",
                        exitCode=123,
                        errorType="test",
                        errorDetails="test")
        self.assertEqual(report.getExitCode(), 12345)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"),
                              {99999, 12345, 123})
        self.assertEqual(
            report.getStepErrors(stepName="cmsRun1")['errorCount'], 3)

        # now try to record the same exit code once again
        report.addError(stepName="cmsRun1",
                        exitCode=12345,
                        errorType="test",
                        errorDetails="test")
        self.assertEqual(report.getExitCode(), 12345)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"),
                              {99999, 12345, 123})
        self.assertEqual(
            report.getStepErrors(stepName="cmsRun1")['errorCount'], 3)

        # and once again, but different type and details (which does not matter)
        report.addError(stepName="cmsRun1",
                        exitCode=12345,
                        errorType="testAA",
                        errorDetails="testAA")
        self.assertEqual(report.getExitCode(), 12345)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"),
                              {99999, 12345, 123})
        self.assertEqual(
            report.getStepErrors(stepName="cmsRun1")['errorCount'], 3)
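
The assertions above pin down the exit-code semantics: no errors means 0, an error without a code is reported as the 99999 placeholder, and once a real non-zero code is recorded it keeps winning over later ones. A rough sketch of that selection rule (not the Report internals):

def step_exit_code(recordedCodes, noCodeDefault=99999):
    """
    Return 0 when no errors were recorded, the first non-zero exit code
    among errors that carry one, or the 99999 placeholder when errors
    exist but none of them has a code.
    """
    if not recordedCodes:
        return 0
    for exitCode in recordedCodes:
        if exitCode:                     # first real, non-zero code wins
            return exitCode
    return noCodeDefault

print(step_exit_code([]))                   # 0
print(step_exit_code([None]))               # 99999
print(step_exit_code([None, 12345, 123]))   # 12345
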
Ejemplo n.º 29
0
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:
            if job.get('cache_dir', None) == None or job.get('retry_count', None) == None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
                logging.error("cache_dir: %s" % job.get('cache_dir', 'Missing'))
                logging.error("retry_count: %s" % job.get('retry_count', 'Missing'))
                continue
            reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # Then we have a real report.
                # Do nothing
                continue
            if os.path.isdir(reportName):
                # Then something weird has happened.
                # File error, do nothing
                logging.error("Went to check on error report for job %i.  Found a directory instead.\n" % job['id'])
                logging.error("Ignoring this, but this is very strange.\n")

            # If we're still here, we must not have a real error report
            logOutput = 'Could not find jobReport\n'
            #But we don't know exactly the condor id, so it will append
            #the last lines of the latest condor log in cache_dir
            genLogPath = os.path.join(job['cache_dir'], 'condor.*.*.log')
            logPaths = glob.glob(genLogPath)
            errLog = None
            if len(logPaths):
                errLog = max(logPaths, key = lambda path :
                                                    os.stat(path).st_mtime)
            if errLog != None and os.path.isfile(errLog):
                logTail = BasicAlgos.tail(errLog, 50)
                logOutput += 'Adding end of condor.log to error message:\n'
                logOutput += '\n'.join(logTail)
            if not os.path.isdir(job['cache_dir']):
                msg =  "Serious Error in Completing condor job with id %s!\n" % job.get('id', 'unknown')
                msg += "Could not find jobCache directory - directory deleted under job: %s\n" % job['cache_dir']
                msg += "Creating artificial cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                logOutput += msg
                condorReport = Report()
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
                condorReport.save(filename = reportName)
                continue
            condorReport = Report()
            condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
            if os.path.isfile(reportName):
                # Then we have a file already there.  It should be zero size due
                # to the if statements above, but we should remove it.
                if os.path.getsize(reportName) > 0:
                    # This should never happen.  If it does, ignore it
                    msg =  "Critical strange problem.  FWJR changed size while being processed."
                    logging.error(msg)
                else:
                    try:
                        os.remove(reportName)
                        condorReport.save(filename = reportName)
                    except Exception as ex:
                        logging.error("Cannot remove and replace empty report %s" % reportName)
                        logging.error("Report continuing without error!")
            else:
                condorReport.save(filename = reportName)

            # Debug message to end loop
            logging.debug("No returning job report for job %i" % job['id'])


        return
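
Because the exact condor id is unknown at this point, the code above globs for condor.*.*.log in the cache directory and tails the newest one. A small standalone version of that lookup, with BasicAlgos.tail replaced by a plain-Python tail for illustration (the cache path in the usage line is made up):

import glob
import os

def tail(path, nLines=50):
    # Simple stand-in for BasicAlgos.tail: last nLines of a text file
    with open(path) as fh:
        return fh.readlines()[-nLines:]

def latest_condor_log(cacheDir):
    """Return the most recently modified condor.*.*.log in cacheDir, or None."""
    logPaths = glob.glob(os.path.join(cacheDir, 'condor.*.*.log'))
    if not logPaths:
        return None
    return max(logPaths, key=lambda path: os.stat(path).st_mtime)

errLog = latest_condor_log('/tmp/some/cache_dir')   # hypothetical path
if errLog and os.path.isfile(errLog):
    print(''.join(tail(errLog, 50)))
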
Ejemplo n.º 30
0
    def submit(self, jobs, info = None):
        """
        _submit_

        Submit jobs for one subscription

        """
        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.queue = self.config.JobSubmitter.LsfPluginQueue
        self.resourceReq =  getattr(self.config.JobSubmitter, 'LsfPluginResourceReq', None)
        self.jobGroup = self.config.JobSubmitter.LsfPluginJobGroup
        self.batchOutput = getattr(self.config.JobSubmitter, 'LsfPluginBatchOutput', None)

        successfulJobs = []
        failedJobs     = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs


        # Now assume that what we get is the following; a mostly
        # unordered list of jobs with random sandboxes.
        # We intend to sort them by sandbox.

        submitDict = {}
        for job in jobs:
            sandbox = job['sandbox']
            if not sandbox in submitDict.keys():
                submitDict[sandbox] = []
            submitDict[sandbox].append(job)


        # Now submit the bastards
        for sandbox in submitDict.keys():
            jobList = submitDict.get(sandbox, [])
            while len(jobList) > 0:
                jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
                jobList   = jobList[self.config.JobSubmitter.jobsPerWorker:]

                for job in jobsReady:

                    if job == {}:
                        # Then I don't know how we got here either
                        logging.error("Was passed a nonexistant job.  Ignoring")
                        continue

                    submitScript = self.makeSubmit(job)

                    if not submitScript:
                        # Then we got nothing
                        logging.error("No submit script made!")
                        return {'NoResult': [0]}

                    submitScriptFile = os.path.join(job['cache_dir'], "submit.sh")
                    handle = open(submitScriptFile, 'w')
                    handle.writelines(submitScript)
                    handle.close()

                    # make reasonable job name
                    jobName = "WMAgentJob"
                    regExpParser = re.compile('.*/JobCreator/JobCache/([^/]+)/[^/]+/.*')
                    match = regExpParser.match(job['cache_dir'])
                    if ( match != None ):
                        jobName = "%s-%s" % (match.group(1), job['id'])

                    # //
                    # // Submit LSF job
                    # //
                    command = 'bsub'
                    command += ' -q %s' % self.queue

                    if self.resourceReq != None:
                        command += ' -R "%s"' % self.resourceReq

                    command += ' -g %s' % self.jobGroup
                    command += ' -J %s' % jobName

                    lsfLogDir = self.batchOutput
                    if lsfLogDir != None:
                        now = datetime.datetime.today()
                        lsfLogDir += '/%s' % now.strftime("%Y%m%d%H")
                        try:
                            os.mkdir(lsfLogDir)
                            logging.debug("Created directory %s" % lsfLogDir)
                        except OSError as err:
                            # suppress LSF log unless it's about an already existing directory
                            if err.errno != errno.EEXIST or not os.path.isdir(lsfLogDir):
                                logging.error("Can't create directory %s, turning off LSF log" % lsfLogDir)
                                lsfLogDir = None

                    if lsfLogDir == None:
                        command += ' -oo /dev/null'
                    else:
                        command += ' -oo %s/%s.%%J.out' % (lsfLogDir, jobName)

                    command += ' < %s' % submitScriptFile

                    logging.info("Submitting LSF job: %s" % command)

                    p = subprocess.Popen(command, shell = True,
                                         stdout = subprocess.PIPE,
                                         stderr = subprocess.STDOUT)
                    
                    stdout = p.communicate()[0] 
                    returncode = p.returncode

                    if returncode == 0:
                        # parse the bsub output for the LSF job id
                        regExpParser = re.compile('Job <([0-9]+)> is submitted to queue')
                        match = regExpParser.match(stdout)
                        if match != None:
                            job['gridid'] = match.group(1)
                            successfulJobs.append(job)
                            logging.info("LSF Job ID : %s" % job['gridid'] )
                            continue
                        else:
                            logging.error("bsub didn't return a valid Job ID. Job is not submitted")
                            logging.error(stdout)    

                    lsfErrorReport = Report()
                    lsfErrorReport.addError("JobSubmit", 61202, "LsfError", stdout)
                    job['fwjr'] = lsfErrorReport
                    failedJobs.append(job)
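
The submission above shells out to bsub and recovers the batch id from its stdout with a regular expression. A minimal sketch of just that parsing step (the sample output strings are made up):

import re

BSUB_JOBID_RE = re.compile(r'Job <([0-9]+)> is submitted to queue')

def parse_bsub_jobid(stdout):
    """Extract the numeric LSF job id from bsub output, or None if it is missing."""
    match = BSUB_JOBID_RE.match(stdout)
    return match.group(1) if match else None

print(parse_bsub_jobid('Job <12345> is submitted to queue <cmscaf1nd>.'))  # 12345
print(parse_bsub_jobid('Request aborted by esub.'))                        # None
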
Ejemplo n.º 31
0
    def submit(self, jobs, info = None):
        """
        _submit_

        Submit jobs for one subscription

        """
        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.queue = self.config.JobSubmitter.LsfPluginQueue
        self.resourceReq =  getattr(self.config.JobSubmitter, 'LsfPluginResourceReq', None)
        self.jobGroup = self.config.JobSubmitter.LsfPluginJobGroup
        self.batchOutput = getattr(self.config.JobSubmitter, 'LsfPluginBatchOutput', None)

        successfulJobs = []
        failedJobs     = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs


        # Now assume that what we get is the following; a mostly
        # unordered list of jobs with random sandboxes.
        # We intend to sort them by sandbox.

        submitDict = {}
        for job in jobs:
            sandbox = job['sandbox']
            if not sandbox in submitDict.keys():
                submitDict[sandbox] = []
            submitDict[sandbox].append(job)


        # Now submit the bastards
        for sandbox in submitDict.keys():
            jobList = submitDict.get(sandbox, [])
            while len(jobList) > 0:
                jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
                jobList   = jobList[self.config.JobSubmitter.jobsPerWorker:]

                for job in jobsReady:

                    if job == {}:
                        # Then I don't know how we got here either
                        logging.error("Was passed a nonexistant job.  Ignoring")
                        continue

                    submitScript = self.makeSubmit(job)

                    if not submitScript:
                        # Then we got nothing
                        logging.error("No submit script made!")
                        return {'NoResult': [0]}

                    submitScriptFile = os.path.join(job['cache_dir'], "submit.sh")
                    handle = open(submitScriptFile, 'w')
                    handle.writelines(submitScript)
                    handle.close()

                    # //
                    # // Submit LSF job
                    # //
                    command = 'bsub'
                    command += ' -q %s' % self.queue

                    if self.resourceReq != None:
                        command += ' -R "%s"' % self.resourceReq

                    command += ' -g %s' % self.jobGroup
                    command += ' -J %s' % "WMAgentJob"

                    if self.batchOutput == None:
                        command += ' -oo /dev/null'
                    else:
                        command += ' -oo %s' % self.batchOutput

                    command += ' < %s' % submitScriptFile

                    logging.info("Submitting LSF job: %s" % command)

                    p = subprocess.Popen(command, shell = True,
                                         stdout = subprocess.PIPE,
                                         stderr = subprocess.STDOUT)
                    stdout = p.communicate()[0]
                    returncode = p.returncode

                    if returncode == 0:
                        # parse the bsub output for the LSF job id
                        regExpParser = re.compile('Job <([0-9]+)> is submitted to queue')
                        match = regExpParser.match(stdout)
                        if match != None:
                            job['gridid'] = match.group(1)
                            successfulJobs.append(job)
                            continue

                    lsfErrorReport = Report()
                    lsfErrorReport.addError("JobSubmit", 61202, "LsfError", stdout)
                    job['fwjr'] = lsfErrorReport
                    failedJobs.append(job)
                    
        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        return successfulJobs, failedJobs
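
Both submit plugins first group the incoming jobs by sandbox and then carve each group into slices of at most jobsPerWorker jobs. Reduced to its essentials, with jobs as plain dicts for illustration:

def group_by_sandbox(jobs):
    """Map each sandbox path to the list of jobs that use it."""
    submitDict = {}
    for job in jobs:
        submitDict.setdefault(job['sandbox'], []).append(job)
    return submitDict

def chunks(jobList, size):
    """Yield successive slices of at most `size` jobs, like the while loop above."""
    while jobList:
        yield jobList[:size]
        jobList = jobList[size:]

jobs = [{'id': i, 'sandbox': 'sbx%d.tar.bz2' % (i % 2)} for i in range(5)]
for sandbox, jobList in group_by_sandbox(jobs).items():
    for jobsReady in chunks(jobList, 2):
        print(sandbox, [j['id'] for j in jobsReady])
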
Ejemplo n.º 32
0
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:

            if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue
            elif os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error("The job report for job with id %s and gridid %s is a directory", job['id'],
                              job['gridid'])
                logging.error("Ignoring this, but this is very strange")
            else:
                logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

                if os.path.isfile(reportName):
                    os.remove(reportName)

                # create a report from scratch
                condorReport = Report()
                logOutput = 'Could not find jobReport\n'

                if os.path.isdir(job['cache_dir']):
                    condorErr = "condor.%s.err" % job['gridid']
                    condorOut = "condor.%s.out" % job['gridid']
                    condorLog = "condor.%s.log" % job['gridid']
                    exitCode = 99303
                    exitType = "NoJobReport"
                    for condorFile in [condorErr, condorOut, condorLog]:
                        condorFilePath = os.path.join(job['cache_dir'], condorFile)
                        logOutput += "\n========== %s ==========\n" % condorFile
                        if os.path.isfile(condorFilePath):
                            logTail = BasicAlgos.tail(condorFilePath, 50)
                            logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                            logOutput += logTail
                            logOutput += '\n\n'

                            if condorFile == condorLog:
                                # for the condor log, extract the removal reason and site name
                                for matchObj in getIterMatchObjectOnRegexp(condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                                    condorReason = matchObj.group("Reason")
                                    if condorReason:
                                        logOutput += condorReason
                                        if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                            exitCode = 99400
                                            exitType = "RemovedByGLIDEIN"
                                        else:
                                            exitCode = 99401

                                    siteName = matchObj.group("Site")
                                    if siteName:
                                        condorReport.data.siteName = siteName
                                    else:
                                        condorReport.data.siteName = "NoReportedSite"
                            else:
                                for matchObj in getIterMatchObjectOnRegexp(condorFilePath, WMEXCEPTION_REGEXP):
                                    errMsg = matchObj.group('WMException')
                                    if errMsg:
                                        logOutput += "\n\n%s\n" % errMsg

                                    errMsg = matchObj.group('ERROR')
                                    if errMsg:
                                        logOutput += "\n\n%s\n" % errMsg

                    logOutput += '\n\n'
                    condorReport.addError(exitType, exitCode, exitType, logOutput)
                else:
                    msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                    msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                    msg += "Creating a new cache_dir for failed job report\n"
                    logging.error(msg)
                    os.makedirs(job['cache_dir'])
                    condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)

                condorReport.save(filename=reportName)

                logging.debug("Created failed job report for job with id %s and gridid %s", job['id'], job['gridid'])

        return
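
The error classification above boils down to: if a removal reason was scraped from the condor log, SYSTEM_PERIODIC_REMOVE or condor_rm removals become 99400/RemovedByGLIDEIN and anything else 99401; otherwise the generic 99303/NoJobReport stands. A sketch of just that mapping (the sample reason strings are made up; the real code extracts them with CONDOR_LOG_FILTER_REGEXP):

def classify_missing_report(condorReason=None):
    """
    Return (exitCode, exitType) for a job that came back without a FWJR,
    based on the removal reason found in the condor log, if any.
    """
    exitCode, exitType = 99303, "NoJobReport"
    if condorReason:
        if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
            exitCode, exitType = 99400, "RemovedByGLIDEIN"
        else:
            exitCode = 99401
    return exitCode, exitType

print(classify_missing_report())                                        # (99303, 'NoJobReport')
print(classify_missing_report("The job was removed via condor_rm"))     # (99400, 'RemovedByGLIDEIN')
print(classify_missing_report("Job disconnected from the schedd"))      # (99401, 'NoJobReport')
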
Ejemplo n.º 33
0
    def complete(self, jobs):
        """
        Do any completion work required

        In this case, look for a returned logfile
        """

        for job in jobs:

            if job.get('cache_dir', None) is None or job.get(
                    'retry_count', None) is None:
                # Then we can't do anything
                logging.error(
                    "Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(job['cache_dir'],
                                      'Report.%i.pkl' % job['retry_count'])
            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue
            elif os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error(
                    "The job report for job with id %s and gridid %s is a directory",
                    job['id'], job['gridid'])
                logging.error("Ignoring this, but this is very strange")
            else:
                logging.error("No job report for job with id %s and gridid %s",
                              job['id'], job['gridid'])

                if os.path.isfile(reportName):
                    os.remove(reportName)

                # create a report from scratch
                condorReport = Report()
                logOutput = 'Could not find jobReport\n'

                if os.path.isdir(job['cache_dir']):
                    condorOut = "condor.%s.out" % job['gridid']
                    condorErr = "condor.%s.err" % job['gridid']
                    condorLog = "condor.%s.log" % job['gridid']
                    for condorFile in [condorOut, condorErr, condorLog]:
                        condorFilePath = os.path.join(job['cache_dir'],
                                                      condorFile)
                        if os.path.isfile(condorFilePath):
                            logTail = BasicAlgos.tail(condorFilePath, 50)
                            logOutput += 'Adding end of %s to error message:\n' % condorFile
                            logOutput += '\n'.join(logTail)
                    condorReport.addError("NoJobReport", 99303, "NoJobReport",
                                          logOutput)
                else:
                    msg = "Serious Error in Completing condor job with id %s!\n" % job[
                        'id']
                    msg += "Could not find jobCache directory %s\n" % job[
                        'cache_dir']
                    msg += "Creating a new cache_dir for failed job report\n"
                    logging.error(msg)
                    os.makedirs(job['cache_dir'])
                    condorReport.addError("NoJobReport", 99304, "NoCacheDir",
                                          logOutput)

                condorReport.save(filename=reportName)

                logging.debug(
                    "Created failed job report for job with id %s and gridid %s",
                    job['id'], job['gridid'])

        return
Ejemplo n.º 34
0
    def submit(self, jobs, info):
        """
        _submit_


        Submit jobs for one subscription
        """

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir  = self.config.JobSubmitter.submitDir
        timeout         = getattr(self.config.JobSubmitter, 'getTimeout', 400)

        successfulJobs = []
        failedJobs     = []
        jdlFiles       = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            logging.info("Starting up CondorPlugin worker pool")
            self.input    = multiprocessing.Queue()
            self.result   = multiprocessing.Queue()
            for x in range(self.nProcess):
                p = multiprocessing.Process(target = submitWorker,
                                            args = (self.input, self.result, timeout))
                p.start()
                self.pool.append(p)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)


        # Now assume that what we get is the following; a mostly
        # unordered list of jobs with random sandboxes.
        # We intend to sort them by sandbox.

        submitDict = {}
        nSubmits   = 0
        for job in jobs:
            sandbox = job['sandbox']
            if not sandbox in submitDict.keys():
                submitDict[sandbox] = []
            submitDict[sandbox].append(job)


        # Now submit the bastards
        queueError = False
        for sandbox in submitDict.keys():
            jobList = submitDict.get(sandbox, [])
            idList = [x['jobid'] for x in jobList]
            if queueError:
                # If the queue has failed, then we must not process
                # any more jobs this cycle.
                continue
            while len(jobList) > 0:
                jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
                jobList   = jobList[self.config.JobSubmitter.jobsPerWorker:]
                idList    = [x['id'] for x in jobsReady]
                jdlList = self.makeSubmit(jobList = jobsReady)
                if not jdlList or jdlList == []:
                    # Then we got nothing
                    logging.error("No JDL file made!")
                    return {'NoResult': [0]}
                jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
                handle = open(jdlFile, 'w')
                handle.writelines(jdlList)
                handle.close()
                jdlFiles.append(jdlFile)

                # Now submit them
                logging.info("About to submit %i jobs" %(len(jobsReady)))
                if self.glexecPath:
                    command = 'CS=`which condor_submit`; '
                    if self.glexecWrapScript:
                        command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                    command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                    command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                    command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                    command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                    if self.glexecUnwrapScript:
                        command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
                    else:
                        command += '%s $CS %s' % (self.glexecPath, jdlFile)
                else:
                    command = "condor_submit %s" % jdlFile

                try:
                    self.input.put({'command': command, 'idList': idList})
                except AssertionError as ex:
                    msg =  "Critical error: input pipeline probably closed.\n"
                    msg += str(ex)
                    msg += "Error Procedure: Something critical has happened in the worker process\n"
                    msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                    msg += "Then refresh the worker pool\n"
                    logging.error(msg)
                    queueError = True
                    break
                nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for n in range(nSubmits):
            try:
                res = self.result.get(block = True, timeout = timeout)
            except Queue.Empty:
                # If the queue was empty go to the next submit
                # Those jobs have vanished
                logging.error("Queue.Empty error received!")
                logging.error("This could indicate a critical condor error!")
                logging.error("However, no information of any use was obtained due to process failure.")
                logging.error("Either process failed, or process timed out after %s seconds." % timeout)
                queueError = True
                continue
            except AssertionError as ex:
                msg =  "Found Assertion error while retrieving output from worker process.\n"
                msg += str(ex)
                msg += "This indicates something critical happened to a worker process"
                msg += "We will recover what jobs we know were submitted, and resubmit the rest"
                msg += "Refreshing worker pool at end of loop"
                logging.error(msg)
                queueError = True
                continue

            try:
                output   = res['stdout']
                error    = res['stderr']
                idList   = res['idList']
                exitCode = res['exitCode']
            except KeyError as ex:
                msg =  "Error in finding key from result pipe\n"
                msg += "Something has gone crticially wrong in the worker\n"
                try:
                    msg += "Result: %s\n" % str(res)
                except:
                    pass
                msg += str(ex)
                logging.error(msg)
                queueError = True
                continue

            if not exitCode == 0:
                logging.error("Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error = error)
                logging.error("Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # report this to the operators.
            if self.errorCount > self.errorThreshold:
                try:
                    msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                    logging.error(msg)
                    logging.error("Reporting to Alert system and continuing to process jobs")
                    from WMCore.Alerts import API as alertAPI
                    preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                     compName = "BossAirCondorPlugin")
                    sendAlert = alertAPI.getSendAlert(sender = sender,
                                                      preAlert = preAlert)
                    sendAlert(6, msg = msg)
                    sender.unregister()
                    self.errorCount = 0
                except:
                    # There's nothing we can really do here
                    pass

        # Remove JDL files unless commanded otherwise
        if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
        # to free up memory (in the midst of the process, the forked
        # memory space shouldn't be touched, so it should still be
        # shared, but after this point any action by the Submitter will
        # result in memory duplication).
        logging.info("Purging worker pool to clean up memory")
        self.close()


        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in CondorPlugin")
        return successfulJobs, failedJobs
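
Submission is fanned out to a pool of worker processes that read commands from an input queue and push results onto a result queue, which is what the nSubmits bookkeeping above drains. A minimal, self-contained version of that pattern; the worker here only echoes a fake result, whereas WMCore's submitWorker runs the actual condor_submit command:

import multiprocessing

def submit_worker(inputQueue, resultQueue, timeout):
    # Stand-in for submitWorker: consume work items until told to stop
    while True:
        work = inputQueue.get()
        if work == 'STOP':
            break
        resultQueue.put({'idList': work['idList'],
                         'stdout': 'would run: %s' % work['command'],
                         'stderr': '',
                         'exitCode': 0})

if __name__ == '__main__':
    inputQueue = multiprocessing.Queue()
    resultQueue = multiprocessing.Queue()
    pool = []
    for _ in range(2):
        p = multiprocessing.Process(target=submit_worker,
                                    args=(inputQueue, resultQueue, 400))
        p.start()
        pool.append(p)

    nSubmits = 0
    for idList in ([1, 2], [3]):
        inputQueue.put({'command': 'condor_submit fake.jdl', 'idList': idList})
        nSubmits += 1

    for _ in range(nSubmits):
        res = resultQueue.get(block=True, timeout=400)
        print(res['idList'], res['exitCode'])

    # Shut the workers down and reclaim their memory
    for _ in pool:
        inputQueue.put('STOP')
    for p in pool:
        p.join()
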
Ejemplo n.º 35
0
    def submit(self, jobs, info=None):
        """
        _submit_


        Submit jobs for one subscription
        """

        # If we're here, then we have submitter components
        self.scriptFile = self.config.JobSubmitter.submitScript
        self.submitDir  = self.config.JobSubmitter.submitDir
        timeout         = getattr(self.config.JobSubmitter, 'getTimeout', 400)

        successfulJobs = []
        failedJobs     = []
        jdlFiles       = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        if len(self.pool) == 0:
            # Starting things up
            # This is obviously a submit API
            logging.info("Starting up CondorPlugin worker pool")
            self.input    = multiprocessing.Queue()
            self.result   = multiprocessing.Queue()
            for x in range(self.nProcess):
                p = multiprocessing.Process(target = submitWorker,
                                            args = (self.input, self.result, timeout))
                p.start()
                self.pool.append(p)

        if not os.path.exists(self.submitDir):
            os.makedirs(self.submitDir)


        # Now assume that what we get is the following; a mostly
        # unordered list of jobs with random sandboxes.
        # We intend to sort them by sandbox.

        submitDict = {}
        nSubmits   = 0
        for job in jobs:
            sandbox = job['sandbox']
            if not sandbox in submitDict.keys():
                submitDict[sandbox] = []
            submitDict[sandbox].append(job)


        # Now submit the bastards
        queueError = False
        for sandbox in submitDict.keys():
            jobList = submitDict.get(sandbox, [])
            idList = [x['jobid'] for x in jobList]
            if queueError:
                # If the queue has failed, then we must not process
                # any more jobs this cycle.
                continue
            while len(jobList) > 0:
                jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
                jobList   = jobList[self.config.JobSubmitter.jobsPerWorker:]
                idList    = [x['id'] for x in jobsReady]
                jdlList = self.makeSubmit(jobList = jobsReady)
                if not jdlList or jdlList == []:
                    # Then we got nothing
                    logging.error("No JDL file made!")
                    return {'NoResult': [0]}
                jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
                handle = open(jdlFile, 'w')
                handle.writelines(jdlList)
                handle.close()
                jdlFiles.append(jdlFile)

                # Now submit them
                logging.info("About to submit %i jobs" %(len(jobsReady)))
                if self.glexecPath:
                    command = 'CS=`which condor_submit`; '
                    if self.glexecWrapScript:
                        command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript
                    command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile
                    command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile
                    command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile
                    command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile
                    if self.glexecUnwrapScript:
                        command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile)
                    else:
                        command += '%s $CS %s' % (self.glexecPath, jdlFile)
                else:
                    command = "condor_submit %s" % jdlFile

                try:
                    self.input.put({'command': command, 'idList': idList})
                except AssertionError as ex:
                    msg =  "Critical error: input pipeline probably closed.\n"
                    msg += str(ex)
                    msg += "Error Procedure: Something critical has happened in the worker process\n"
                    msg += "We will now proceed to pull all useful data from the queue (if it exists)\n"
                    msg += "Then refresh the worker pool\n"
                    logging.error(msg)
                    queueError = True
                    break
                nSubmits += 1

        # Now we should have sent all jobs to be submitted
        # Going to do the rest of it now
        for n in range(nSubmits):
            try:
                res = self.result.get(block = True, timeout = timeout)
            except Queue.Empty:
                # If the queue was empty go to the next submit
                # Those jobs have vanished
                logging.error("Queue.Empty error received!")
                logging.error("This could indicate a critical condor error!")
                logging.error("However, no information of any use was obtained due to process failure.")
                logging.error("Either process failed, or process timed out after %s seconds." % timeout)
                queueError = True
                continue
            except AssertionError as ex:
                msg =  "Found Assertion error while retrieving output from worker process.\n"
                msg += str(ex)
                msg += "This indicates something critical happened to a worker process"
                msg += "We will recover what jobs we know were submitted, and resubmit the rest"
                msg += "Refreshing worker pool at end of loop"
                logging.error(msg)
                queueError = True
                continue

            try:
                output   = res['stdout']
                error    = res['stderr']
                idList   = res['idList']
                exitCode = res['exitCode']
            except KeyError as ex:
                msg =  "Error in finding key from result pipe\n"
                msg += "Something has gone critically wrong in the worker\n"
                try:
                    msg += "Result: %s\n" % str(res)
                except:
                    pass
                msg += str(ex)
                logging.error(msg)
                queueError = True
                continue

            if not exitCode == 0:
                logging.error("Condor returned non-zero.  Printing out command stderr")
                logging.error(error)
                errorCheck, errorMsg = parseError(error = error)
                logging.error("Processing failed jobs and proceeding to the next jobs.")
                logging.error("Do not restart component.")
            else:
                errorCheck = None

            if errorCheck:
                self.errorCount += 1
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            job['fwjr'] = condorErrorReport
                            failedJobs.append(job)
                            break
            else:
                if self.errorCount > 0:
                    self.errorCount -= 1
                for jobID in idList:
                    for job in jobs:
                        if job.get('id', None) == jobID:
                            successfulJobs.append(job)
                            break

            # If we get a lot of errors in a row it's probably time to
            # report this to the operators.
            if self.errorCount > self.errorThreshold:
                try:
                    msg = "Exceeded errorThreshold while submitting to condor. Check condor status."
                    logging.error(msg)
                    logging.error("Reporting to Alert system and continuing to process jobs")
                    from WMCore.Alerts import API as alertAPI
                    preAlert, sender = alertAPI.setUpAlertsMessaging(self,
                                                                     compName = "BossAirCondorPlugin")
                    sendAlert = alertAPI.getSendAlert(sender = sender,
                                                      preAlert = preAlert)
                    sendAlert(6, msg = msg)
                    sender.unregister()
                    self.errorCount = 0
                except:
                    # There's nothing we can really do here
                    pass

        # Remove JDL files unless commanded otherwise
        if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True):
            for f in jdlFiles:
                os.remove(f)

        # When we're finished, clean up the queue workers in order
        # to free up memory (in the midst of the process, the forked
        # memory space shouldn't be touched, so it should still be
        # shared, but after this point any action by the Submitter will
        # result in memory duplication).
        logging.info("Purging worker pool to clean up memory")
        self.close()


        # We must return a list of jobs successfully submitted,
        # and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in CondorPlugin")
        return successfulJobs, failedJobs
Ejemplo n.º 36
0
    def kill(self, jobs, workflowName=None, killMsg=None, errorCode=71300):
        """
        _kill_

        Kill jobs using plugin functions:

        Only active jobs (status = 1) will be killed. If workflowName is given,
        then kill all its jobs in one shot.
        An optional killMsg can be sent; this will be written into the job FWJR.
        The errorCode will be the one specified and if no killMsg is provided then
        a standard message associated with the exit code will be used.
        If a previous FWJR exists, this error will be appended to it.
        """
        if not jobs:
            return
        jobsToKill = {}

        # Now get a list of which jobs are in the batch system
        # only kill jobs present there
        loadedJobs = self._buildRunningJobs(wmbsJobs=jobs)

        for runningJob in loadedJobs:
            plugin = runningJob['plugin']
            jobsToKill.setdefault(plugin, [])
            jobsToKill[plugin].append(runningJob)

        for plugin in jobsToKill.keys():
            if plugin not in self.plugins.keys():
                msg = "Jobs tracking with non-existant plugin %s\n" % (plugin)
                msg += "They were submitted but can't be tracked?\n"
                msg += "That's too strange to continue\n"
                logging.error(msg)
                raise BossAirException(msg)
            else:
                # Then we send them to the plugins
                try:
                    pluginInst = self.plugins[plugin]
                    if workflowName:
                        # jobs are completed regardless whether the kill succeeded or not
                        self._completeKill(jobs=jobsToKill[plugin])
                        pluginInst.killWorkflowJobs(workflow=workflowName)
                    else:
                        # raise an exception if it fails to kill jobs, such that the same
                        # jobs are retried again in the next cycle
                        pluginInst.kill(jobs=jobsToKill[plugin], raiseEx=True)
                        self._completeKill(jobs=jobsToKill[plugin])

                    # Register the killed jobs
                    for job in jobsToKill[plugin]:
                        if job.get('cache_dir') is None or job.get('retry_count') is None:
                            continue
                        # Try to save an error report as the jobFWJR
                        if not os.path.isdir(job['cache_dir']):
                            # Then we have a bad cache directory
                            logging.error("Could not write a kill FWJR due to non-existant cache_dir for job %i\n", job['id'])
                            logging.debug("cache_dir: %s\n", job['cache_dir'])
                            continue
                        reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
                        errorReport = Report()
                        if os.path.exists(reportName) and os.path.getsize(reportName) > 0:
                            # Then there's already a report there.  Add messages
                            errorReport.load(reportName)
                        # Build a better job message
                        if killMsg:
                            reportedMsg = killMsg
                        else:
                            reportedMsg = WM_JOB_ERROR_CODES[errorCode]
                            reportedMsg += '\n Job last known status was: %s' % job.get('globalState', 'Unknown')
                        errorReport.addError("JobKilled", errorCode, "JobKilled", reportedMsg)
                        try:
                            errorReport.save(filename=reportName)
                        except IOError as ioe:
                            logging.warning('Cannot write report %s because of %s', reportName, ioe)
                except RuntimeError:
                    logging.warning("Plugin failed to remove jobs. It will be retried in the next cycle.")
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while calling kill method for plugin %s\n" % plugin
                    msg += str(ex)
                    logging.error(msg)
                    logging.debug("Interrupted while killing following jobs: %s\n", jobsToKill[plugin])
                    raise BossAirException(msg)
        return
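
A hypothetical call site for the kill() method above might look as follows; only the keyword arguments come from the signature shown, while the bossAir handle, job list, and workflow name are placeholders. A plain Exception is caught here so the sketch stays self-contained, whereas the real method re-raises BossAirException.

import logging

def stopWorkflow(bossAir, jobsToStop, workflowName):
    """Illustrative wrapper: `bossAir` is whatever object exposes kill() and
    `jobsToStop` a list of loaded WMBS job dicts (id, cache_dir, retry_count, ...)."""
    try:
        # Kill every job of the workflow in one shot, with a custom FWJR message
        bossAir.kill(jobs=jobsToStop, workflowName=workflowName,
                     killMsg='Workflow aborted by the operator', errorCode=71300)
    except Exception as ex:
        # kill() re-raises unhandled plugin failures (as BossAirException above)
        logging.error("Kill cycle failed, will retry next cycle: %s", str(ex))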
Ejemplo n.º 37
0
    def testExitCode(self):
        """
        _testExitCode_

        Test and see if we can get an exit code out of a report

        Note: Errors without a return code return 99999
        getStepExitCode: returns the first valid and non-zero exit code
        getExitCode: uses the method above to get an exit code
        getStepExitCodes: returns a set of all exit codes within the step
        """

        report = Report("cmsRun1")
        self.assertEqual(report.getExitCode(), 0)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 0)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {})
        self.assertItemsEqual(report.getStepErrors(stepName="cmsRun1"), {})

        report.addError(stepName="cmsRun1", exitCode=None, errorType="test", errorDetails="test")
        # None is not a valid exitCode, but it will get mapped to 99999
        self.assertEqual(report.getExitCode(), 99999)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 99999)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 1)

        report.addError(stepName="cmsRun1", exitCode=102, errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 2)

        report.addError(stepName="cmsRun1", exitCode=103, errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 3)

        # now record yet another, different exit code
        report.addError(stepName="cmsRun1", exitCode=104, errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 4)

        # and once again, but different type and details (which does not matter)
        report.addError(stepName="cmsRun1", exitCode=105, errorType="testEE", errorDetails="testAA")
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 5)

        # and once again, but different type and details - testing unicode handling
        report.addError(stepName="cmsRun1", exitCode=106, errorType="test", errorDetails="1 тℯṧт")
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 6)

        # and once again, but different type and details - testing unicode handling
        report.addError(stepName="cmsRun1", exitCode=107, errorType="test", errorDetails="2 тℯṧт \x95")
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 7)

        # and once again, but different type and details - testing unicode handling
        report.addError(stepName="cmsRun1", exitCode=108, errorType="test", errorDetails=encodeUnicodeToBytes("3 тℯṧт"))
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107, 108})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 8)

        # and once again, but different type and details - testing unicode handling
        report.addError(stepName="cmsRun1", exitCode=109, errorType="test", errorDetails=decodeBytesToUnicode("4 тℯṧт"))
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107, 108, 109})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 9)

        # and once again, but different type and details - testing unicode handling
        report.addError(stepName="cmsRun1", exitCode=110, errorType="test", errorDetails={"нεʟʟ◎": 3.14159})
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107, 108, 109, 110})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 10)

        # and once again, but different type and details - testing unicode handling
        report.addError(stepName="cmsRun1", exitCode=111, errorType="test", errorDetails={"нεʟʟ◎ \x95": "ẘøґℓ∂ \x95"})
        self.assertEqual(report.getExitCode(), 102)
        self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102)
        self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111})
        self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 11)
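
As a standalone sketch (not the actual Report implementation), the precedence rule this test exercises can be written out explicitly: a missing exit code maps to 99999, and the first valid, non-zero code that was recorded is the one reported back.

def firstNonZeroExitCode(recordedCodes, missingCodeValue=99999):
    """Illustrative helper (not WMCore code): None maps to 99999 and the first
    valid, non-zero exit code that was recorded wins over later ones."""
    normalized = [missingCodeValue if code is None else code for code in recordedCodes]
    for code in normalized:
        if code not in (0, missingCodeValue):
            return code
    # Only missing or zero codes were recorded
    return missingCodeValue if missingCodeValue in normalized else 0

assert firstNonZeroExitCode([]) == 0
assert firstNonZeroExitCode([None]) == 99999
assert firstNonZeroExitCode([None, 102, 103, 104]) == 102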
Ejemplo n.º 38
0
    def execute(self, emulator = None):
        """
        _execute_


        """
        #Are we using emulators again?
        if emulator is not None:
            return emulator.emulate( self.step, self.job )


        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        logging.info("StageOut override is: %s " % self.step)

        # Pull out StageOutMgr Overrides

        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True


        stageOutCall = {}
        if "command" in overrides and "option" in overrides \
               and "se-name" in overrides and "phedex-node" in overrides \
               and"lfn-prefix" in overrides:
            logging.critical('using override in StageOut')
            stageOutCall['command']    = overrides.get('command')
            stageOutCall['option']     = overrides.get('option')
            stageOutCall['se-name']    = overrides.get('se-name')
            stageOutCall['phedex-node']= overrides.get('phedex-node')
            stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

        # naw man, this is real
        # iterate over all the incoming files
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr.StageOutMgr(**stageOutCall)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime  = self.step.retryDelay
        else:
            # new style
            logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
            print "STAGEOUT IS USING NEW STAGEOUT CODE"
            manager = WMCore.Storage.FileManager.StageOutMgr(
                                retryPauseTime  = self.step.retryDelay,
                                numberOfRetries = self.step.retryCount,
                                **stageOutCall)

        # We need to find a list of steps in our task
        # And eventually a list of jobReports for our steps

        # Search through steps for report files
        filesTransferred = []

        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                #Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
            logging.info("Beginning report processing for step %s" % (step))
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s" \
                              % (step, stepLocation))
                continue
            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)
            taskID = getattr(stepReport.data, 'id', None)

            # Don't stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Okay, time to start using stuff
            # Now I'm a bit confused about this; each report should ONLY
            # Have the results of that particular step in it,
            # So getting all the files should get ONLY the files
            # for that step; or so I hope
            files = stepReport.getAllFileRefsFromStep(step = step)
            for file in files:
                if not hasattr(file, 'lfn') and hasattr(file, 'pfn'):
                    # Then we're truly hosed on this file; ignore it
                    msg = "Not a file: %s" % file
                    logging.error(msg)
                    continue
                # Support direct-to-merge
                # This requires pulling a bunch of stuff from everywhere
                # First check if it's needed
                if hasattr(self.step.output, 'minMergeSize') \
                       and hasattr(file, 'size') \
                       and not getattr(file, 'merged', False):

                    # We need both of those to continue, and we don't
                    # direct-to-merge
                    if getattr(self.step.output, 'doNotDirectMerge', False):
                        # Then we've been told explicitly not to do direct-to-merge
                        continue
                    if file.size >= self.step.output.minMergeSize:
                        # Then this goes direct to merge
                        try:
                            file = self.handleLFNForMerge(mergefile = file, step = step)
                        except Exception as ex:
                            logging.error("Encountered error while handling LFN for merge due to size.\n")
                            logging.error(str(ex))
                            logging.debug(file)
                            logging.debug("minMergeSize: %s" % self.step.output.minMergeSize)
                            manager.cleanSuccessfulStageOuts()
                            stepReport.addError(self.stepName, 60401,
                                                "DirectToMergeFailure", str(ex))
                    elif getattr(self.step.output, 'maxMergeEvents', None) != None\
                             and getattr(file, 'events', None) != None\
                             and not getattr(file, 'merged', False):
                        # Then direct-to-merge due to events if
                        # the file is large enough:
                        if file.events >= self.step.output.maxMergeEvents:
                            # straight to merge
                            try:
                                file = self.handleLFNForMerge(mergefile = file, step = step)
                            except Exception as ex:
                                logging.error("Encountered error while handling LFN for merge due to events.\n")
                                logging.error(str(ex))
                                logging.debug(file)
                                logging.debug("maxMergeEvents: %s" % self.step.output.maxMergeEvents)
                                manager.cleanSuccessfulStageOuts()
                                stepReport.addError(self.stepName, 60402,
                                                    "DirectToMergeFailure", str(ex))

                # Save the input PFN in case we need it
                # Undecided whether to move file.pfn to the output PFN
                file.InputPFN   = file.pfn
                lfn = getattr(file, 'lfn')
                fileSource = getattr(file, 'Source', None)
                if fileSource in ['TFileService', 'UserDefined']:
                    userLfnRegEx(lfn)
                else:
                    lfnRegEx(lfn)
                fileForTransfer = {'LFN': lfn,
                                   'PFN': getattr(file, 'pfn'),
                                   'SEName' : None,
                                   'PNN' : None,
                                   'StageOutCommand': None,
                                   'Checksums' : getattr(file, 'checksums', None)}
                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    manager(fileForTransfer)
                    #Afterwards, the file should have updated info.
                    filesTransferred.append(fileForTransfer)
                    file.StageOutCommand = fileForTransfer['StageOutCommand']
#                    file.location        = fileForTransfer['SEName']
                    file.location        = fileForTransfer['PNN']
                    file.OutputPFN       = fileForTransfer['PFN']
                except Alarm:
                    msg = "Indefinite hang during stageOut of logArchive"
                    logging.error(msg)
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60403,
                                        "StageOutTimeout", msg)
                    stepReport.persist("Report.pkl")
                except Exception as ex:
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60307,
                                        "StageOutFailure", str(ex))
                    stepReport.setStepStatus(self.stepName, 1)
                    stepReport.persist("Report.pkl")
                    raise

                signal.alarm(0)



            # Am DONE with report
            # Persist it
            stepReport.persist(reportLocation)



        # Done with all steps, and should have a list of
        # staged-out files in filesTransferred
        logging.info("Transferred %i files" %(len(filesTransferred)))
        return
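
The alarmHandler and Alarm used above are defined elsewhere in the module; a typical implementation of that SIGALRM timeout pattern looks roughly like the sketch below (names chosen to match the calls above, the real WMCore definitions may differ).

import signal

class Alarm(Exception):
    """Raised when the stage-out wall-clock budget is exceeded."""
    pass

def alarmHandler(signum, frame):
    # Invoked when the countdown started by signal.alarm() expires
    raise Alarm()

# Usage sketch mirroring the transfer loop above
signal.signal(signal.SIGALRM, alarmHandler)
signal.alarm(30)           # give the transfer a 30 second budget
try:
    pass                   # manager(fileForTransfer) would run here
except Alarm:
    print("stage-out timed out")
finally:
    signal.alarm(0)        # always cancel any pending alarm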
Ejemplo n.º 39
0
                # Try to save an error report as the jobFWJR
                if not os.path.isdir(job['cache_dir']):
                    # Then we have a bad cache directory
                    logging.error("Could not write a kill FWJR due to non-existant cache_dir for job %i\n" % job['id'])
                    logging.debug("cache_dir: %s\n" % job['cache_dir'])
                    continue
                reportName = os.path.join(job['cache_dir'],
                                              'Report.%i.pkl' % job['retry_count'])
                if os.path.exists(reportName) and os.path.getsize(reportName) > 0:
                    # Then there's already a report there.  Ignore this.
                    logging.debug("Not writing report due to pre-existing report for job %i.\n" % job['id'])
                    logging.debug("ReportPath: %s\n" % reportName)
                    continue
                else:
                    condorErrorReport = Report()
                    condorErrorReport.addError("JobKilled", 61302, "JobKilled", killMsg)
                    condorErrorReport.save(filename = reportName)

        return



    def update(self, jobs):
        """
        _update_

        Overwrite the database with whatever you put into
        this function.
        """

        runJobs = self._buildRunningJobs(wmbsJobs = jobs)
Ejemplo n.º 40
0
    def execute(self, emulator = None):
        """
        _execute_


        """
        #Are we using emulators again?
        if emulator is not None:
            return emulator.emulate( self.step, self.job )


        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        logging.info("StageOut override is: %s " % self.step)

        # Pull out StageOutMgr Overrides

        # switch between old stageOut behavior and new, fancy stage out behavior
        useNewStageOutCode = False
        if 'newStageOut' in overrides and overrides.get('newStageOut'):
            useNewStageOutCode = True


        stageOutCall = {}
        if overrides.has_key("command") and overrides.has_key("option") \
               and overrides.has_key("se-name") and overrides.has_key("lfn-prefix"):
            logging.critical('using override in StageOut')
            stageOutCall['command']    = overrides.get('command')
            stageOutCall['option']     = overrides.get('option')
            stageOutCall['se-name']    = overrides.get('se-name')
            stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

        # naw man, this is real
        # iterate over all the incoming files
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr.StageOutMgr(**stageOutCall)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime  = self.step.retryDelay
        else:
            # new style
            logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
            print "STAGEOUT IS USING NEW STAGEOUT CODE"
            manager = WMCore.Storage.FileManager.StageOutMgr(
                                retryPauseTime  = self.step.retryDelay,
                                numberOfRetries = self.step.retryCount,
                                **stageOutCall)

        # We need to find a list of steps in our task
        # And eventually a list of jobReports for our steps

        # Search through steps for report files
        filesTransferred = []

        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                #Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
            logging.info("Beginning report processing for step %s" % (step))
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s" \
                              % (step, stepLocation))
                continue
            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)
            taskID = getattr(stepReport.data, 'id', None)

            # Don't stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Okay, time to start using stuff
            # Now I'm a bit confused about this; each report should ONLY
            # Have the results of that particular step in it,
            # So getting all the files should get ONLY the files
            # for that step; or so I hope
            files = stepReport.getAllFileRefsFromStep(step = step)
            for file in files:
                if not hasattr(file, 'lfn') and hasattr(file, 'pfn'):
                    # Then we're truly hosed on this file; ignore it
                    msg = "Not a file: %s" % file
                    logging.error(msg)
                    continue
                # Support direct-to-merge
                # This requires pulling a bunch of stuff from everywhere
                # First check if it's needed
                if hasattr(self.step.output, 'minMergeSize') \
                       and hasattr(file, 'size') \
                       and not getattr(file, 'merged', False):

                    # We need both of those to continue, and we don't
                    # direct-to-merge
                    if getattr(self.step.output, 'doNotDirectMerge', False):
                        # Then we've been told explicitly not to do direct-to-merge
                        continue
                    if file.size >= self.step.output.minMergeSize:
                        # Then this goes direct to merge
                        try:
                            file = self.handleLFNForMerge(mergefile = file, step = step)
                        except Exception as ex:
                            logging.error("Encountered error while handling LFN for merge due to size.\n")
                            logging.error(str(ex))
                            logging.debug(file)
                            logging.debug("minMergeSize: %s" % self.step.output.minMergeSize)
                            manager.cleanSuccessfulStageOuts()
                            stepReport.addError(self.stepName, 60401,
                                                "DirectToMergeFailure", str(ex))
                    elif getattr(self.step.output, 'maxMergeEvents', None) != None\
                             and getattr(file, 'events', None) != None\
                             and not getattr(file, 'merged', False):
                        # Then direct-to-merge due to events if
                        # the file is large enough:
                        if file.events >= self.step.output.maxMergeEvents:
                            # straight to merge
                            try:
                                file = self.handleLFNForMerge(mergefile = file, step = step)
                            except Exception as ex:
                                logging.error("Encountered error while handling LFN for merge due to events.\n")
                                logging.error(str(ex))
                                logging.debug(file)
                                logging.debug("maxMergeEvents: %s" % self.step.output.maxMergeEvents)
                                manager.cleanSuccessfulStageOuts()
                                stepReport.addError(self.stepName, 60402,
                                                    "DirectToMergeFailure", str(ex))

                # Save the input PFN in case we need it
                # Undecided whether to move file.pfn to the output PFN
                file.InputPFN   = file.pfn
                lfn = getattr(file, 'lfn')
                fileSource = getattr(file, 'Source', None)
                if fileSource in ['TFileService', 'UserDefined']:
                    userLfnRegEx(lfn)
                else:
                    lfnRegEx(lfn)
                fileForTransfer = {'LFN': lfn,
                                   'PFN': getattr(file, 'pfn'),
                                   'SEName' : None,
                                   'StageOutCommand': None}
                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    manager(fileForTransfer)
                    #Afterwards, the file should have updated info.
                    filesTransferred.append(fileForTransfer)
                    file.StageOutCommand = fileForTransfer['StageOutCommand']
                    file.location        = fileForTransfer['SEName']
                    file.OutputPFN       = fileForTransfer['PFN']
                except Alarm:
                    msg = "Indefinite hang during stageOut of logArchive"
                    logging.error(msg)
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60403,
                                        "StageOutTimeout", msg)
                    stepReport.persist("Report.pkl")
                except Exception as ex:
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60307,
                                        "StageOutFailure", str(ex))
                    stepReport.setStepStatus(self.stepName, 1)
                    stepReport.persist("Report.pkl")
                    raise
Ejemplo n.º 41
0
    def kill(self, jobs, workflowName=None, killMsg=None, errorCode=71300):
        """
        _kill_

        Kill jobs using plugin functions:

        Only active jobs (status = 1) will be killed. If workflowName is given,
        then kill all its jobs in one shot.
        An optional killMsg can be sent; this will be written into the job FWJR.
        The errorCode will be the one specified and if no killMsg is provided then
        a standard message associated with the exit code will be used.
        If a previous FWJR exists, this error will be appended to it.
        """
        if not jobs:
            return
        jobsToKill = {}

        # Now get a list of which jobs are in the batch system
        # only kill jobs present there
        loadedJobs = self._buildRunningJobs(wmbsJobs=jobs)

        for runningJob in loadedJobs:
            plugin = runningJob['plugin']
            jobsToKill.setdefault(plugin, [])
            jobsToKill[plugin].append(runningJob)

        for plugin in jobsToKill.keys():
            if plugin not in self.plugins.keys():
                msg = "Jobs tracking with non-existant plugin %s\n" % (plugin)
                msg += "They were submitted but can't be tracked?\n"
                msg += "That's too strange to continue\n"
                logging.error(msg)
                raise BossAirException(msg)
            else:
                # Then we send them to the plugins
                try:
                    pluginInst = self.plugins[plugin]
                    if workflowName:
                        pluginInst.killWorkflowJobs(workflow=workflowName)
                    else:
                        pluginInst.kill(jobs=jobsToKill[plugin])
                    # Register the killed jobs
                    for job in jobsToKill[plugin]:
                        if job.get('cache_dir') is None or job.get(
                                'retry_count') is None:
                            continue
                        # Try to save an error report as the jobFWJR
                        if not os.path.isdir(job['cache_dir']):
                            # Then we have a bad cache directory
                            logging.error(
                                "Could not write a kill FWJR due to non-existant cache_dir for job %i\n",
                                job['id'])
                            logging.debug("cache_dir: %s\n", job['cache_dir'])
                            continue
                        reportName = os.path.join(
                            job['cache_dir'],
                            'Report.%i.pkl' % job['retry_count'])
                        errorReport = Report()
                        if os.path.exists(reportName) and os.path.getsize(
                                reportName) > 0:
                            # Then there's already a report there.  Add messages
                            errorReport.load(reportName)
                        # Build a better job message
                        if killMsg:
                            reportedMsg = killMsg
                        else:
                            reportedMsg = WM_JOB_ERROR_CODES[errorCode]
                            reportedMsg += '\n Job last known status was: %s' % job.get(
                                'globalState', 'Unknown')
                        errorReport.addError("JobKilled", errorCode,
                                             "JobKilled", reportedMsg)
                        try:
                            errorReport.save(filename=reportName)
                        except IOError as ioe:
                            logging.warning(
                                'Cannot write report %s because of %s',
                                reportName, ioe)
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while calling kill method for plugin %s\n" % plugin
                    msg += str(ex)
                    logging.error(msg)
                    logging.debug(
                        "Interrupted while killing following jobs: %s\n",
                        jobsToKill[plugin])
                    raise BossAirException(msg)
                finally:
                    # Even if kill fails, complete the jobs
                    self._completeKill(jobs=jobsToKill[plugin])
        return
Ejemplo n.º 42
0
                    logging.error(
                        "Could not write a kill FWJR due to non-existant cache_dir for job %i\n"
                        % job['id'])
                    logging.debug("cache_dir: %s\n" % job['cache_dir'])
                    continue
                reportName = os.path.join(job['cache_dir'],
                                          'Report.%i.pkl' % job['retry_count'])
                condorErrorReport = Report()
                if os.path.exists(
                        reportName) and os.path.getsize(reportName) > 0:
                    # Then there's already a report there.  Add messages
                    condorErrorReport.load(reportName)
                #Build a better job message
                reportedMsg = killMsg + '\n Job last known status was: %s' % job.get(
                    'globalState', 'Unknown')
                condorErrorReport.addError("JobKilled", 61302, "JobKilled",
                                           reportedMsg)
                try:
                    condorErrorReport.save(filename=reportName)
                except IOError as ioe:
                    logging.warning('Cannot write report %s because of %s' %
                                    (reportName, ioe))

        return

    def update(self, jobs):
        """
        _update_

        Overwrite the database with whatever you put into
        this function.
        """
Ejemplo n.º 43
0
                    continue
                # Try to save an error report as the jobFWJR
                if not os.path.isdir(job['cache_dir']):
                    # Then we have a bad cache directory
                    logging.error("Could not write a kill FWJR due to non-existant cache_dir for job %i\n" % job['id'])
                    logging.debug("cache_dir: %s\n" % job['cache_dir'])
                    continue
                reportName = os.path.join(job['cache_dir'],
                                              'Report.%i.pkl' % job['retry_count'])
                condorErrorReport = Report()
                if os.path.exists(reportName) and os.path.getsize(reportName) > 0:
                    # Then there's already a report there.  Add messages
                    condorErrorReport.load(reportName)
                #Build a better job message
                reportedMsg = killMsg + '\n Job last known status was: %s' % job.get('globalState', 'Unknown')
                condorErrorReport.addError("JobKilled", 61302, "JobKilled", reportedMsg)
                condorErrorReport.save(filename = reportName)

        return



    def update(self, jobs):
        """
        _update_

        Overwrite the database with whatever you put into
        this function.
        """

        runJobs = self._buildRunningJobs(wmbsJobs = jobs)