Example #1
    def testA_runCommand(self):
        """
        _runCommand_

        Check that we can run a basic shell command.
        Also check that we get the exit code back correctly.
        """

        # First ls a directory you know will have stuff in it
        stdout, stderr, retcode = SubprocessAlgos.runCommand(cmd='ls /tmp/')
        self.assertTrue(len(stdout) > 0)
        self.assertEqual(retcode, 0)

        # Now try the same command with the shell off
        stdout, stderr, retcode = SubprocessAlgos.runCommand(
            cmd=['ls', '/tmp/'])
        self.assertTrue(len(stdout) > 0)
        self.assertEqual(retcode, 0)

        # Now test and see if we can catch a non-zero return code and output
        stdout, stderr, retcode = SubprocessAlgos.runCommand(
            cmd='echo HELP; exit 5')
        self.assertEqual(stdout, 'HELP\n')
        self.assertEqual(stderr, '')
        self.assertEqual(retcode, 5)

        # Now see if we can do a timeout if the process takes too long
        self.assertRaises(SubprocessAlgos.SubprocessAlgoException,
                          SubprocessAlgos.runCommand,
                          cmd='sleep 10',
                          timeout=1)

        # And this one will run longer than the timeout, but not raise because the timeout is not an integer
        SubprocessAlgos.runCommand(cmd='sleep 1', timeout=0.1)
        return
Example #2
    def testA_runCommand(self):
        """
        _runCommand_

        Check that we can run a basic shell command.
        Also check that we get the exit code back correctly.
        """

        # First ls a directory you know will have stuff in it
        stdout, stderr, retcode = SubprocessAlgos.runCommand(cmd = 'ls /tmp/')
        self.assertTrue(len(stdout) > 0)
        self.assertEqual(retcode, 0)

        # Now try the same command with the shell off
        stdout, stderr, retcode = SubprocessAlgos.runCommand(cmd = ['ls', '/tmp/'])
        self.assertTrue(len(stdout) > 0)
        self.assertEqual(retcode, 0)

        # Now test and see if we can catch a non-zero return code and output
        stdout, stderr, retcode = SubprocessAlgos.runCommand(cmd = 'echo HELP; exit 5')
        self.assertEqual(stdout, 'HELP\n')
        self.assertEqual(stderr, '')
        self.assertEqual(retcode, 5)

        # Now see if we can do a timeout if the process takes too long
        self.assertRaises(SubprocessAlgos.SubprocessAlgoException,
                          SubprocessAlgos.runCommand, cmd = 'sleep 10', timeout = 1)

        # And this one will run longer than the timeout, but not raise because the timeout is not an integer
        SubprocessAlgos.runCommand(cmd = 'sleep 1', timeout = 0.1)
        return
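
For reference, here is a minimal sketch of a runCommand-style helper with the call signature these tests exercise: a string command run through the shell or a list executed directly, an optional timeout, and a (stdout, stderr, retcode) return. It is an illustration built on the standard subprocess module, not the actual SubprocessAlgos implementation, and unlike the behaviour noted in the test it raises on any expired timeout, integer or not.

import subprocess

class SubprocessAlgoException(Exception):
    pass

def runCommand(cmd, timeout=None):
    # A string command goes through the shell, a list is executed directly,
    # mirroring the two call styles exercised in the tests above
    shell = isinstance(cmd, str)
    proc = subprocess.Popen(cmd, shell=shell,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            universal_newlines=True)
    try:
        stdout, stderr = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.communicate()
        raise SubprocessAlgoException("Command timed out after %s seconds" % timeout)
    return stdout, stderr, proc.returncode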
Example #3
def submitWorker(input, results, timeout = None):
    """
    _submitWorker_

    Runs a command in a subprocess.

    This takes whatever you send it (a single ID),
    executes the command,
    and then returns the stdout result.

    I planned this to do a glite-job-output command
    in massively parallel fashion, possibly using the bulkID
    instead of the gridID.  Either way, all you have
    to change is the command here, and what is sent in
    by the complete() function.
    """

    # Get this started
    while True:
        try:
            work = input.get()
        except (EOFError, IOError) as ex:
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            crashMessage += str(ex)
            logging.error(crashMessage)
            break
        except Exception as ex:
            msg =  "Hit unidentified exception getting work\n"
            msg += str(ex)
            msg += "Assuming everything's totally hosed.  Killing process.\n"
            logging.error(msg)
            break

        if work == 'STOP':
            # Put the brakes on
            logging.info("submitWorker multiprocess issued STOP command!")
            break

        command = work.get('command', None)
        idList  = work.get('idList', [])
        if not command:
            results.put({'stdout': '', 'stderr': '999100\n Got no command!', 'idList': idList})
            continue

        try:
            stdout, stderr, returnCode = SubprocessAlgos.runCommand(cmd = command, shell = True, timeout = timeout)
            if returnCode == 0:
                results.put({'stdout': stdout, 'stderr': stderr, 'idList': idList, 'exitCode': returnCode})
            else:
                results.put({'stdout': stdout,
                             'stderr': 'Non-zero exit code: %s\n stderr: %s' % (returnCode, stderr),
                             'exitCode': returnCode,
                             'idList': idList})
        except Exception as ex:
            msg =  "Critical error in subprocess while submitting to condor"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            results.put({'stdout': '', 'stderr': '999101\n %s' % msg, 'idList': idList, 'exitCode': 999101})

    return 0
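
A hedged sketch of how submitWorker() could be driven from a parent process: work dicts carry 'command' and 'idList', the 'STOP' sentinel shuts the worker down, and results come back on the second queue. The queue and function names below are illustrative; the pool management WMCore wraps around this is omitted.

import multiprocessing

def exampleDriver():
    # submitWorker is assumed to be importable from the module shown above
    workQueue = multiprocessing.Queue()
    resultQueue = multiprocessing.Queue()
    worker = multiprocessing.Process(target=submitWorker,
                                     args=(workQueue, resultQueue),
                                     kwargs={'timeout': 300})
    worker.start()

    workQueue.put({'command': 'echo hello', 'idList': [1, 2, 3]})
    workQueue.put('STOP')

    # e.g. {'stdout': 'hello\n', 'stderr': '', 'idList': [1, 2, 3], 'exitCode': 0}
    print(resultQueue.get())
    worker.join()

if __name__ == '__main__':
    exampleDriver()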
Example #4
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg =  "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return
        rss   = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s" % (output[2], output[3],
                                                                  output[4], output[5]))

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True


        if killProc:
            logging.error(msg)
            report  = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find an existing job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existing error report in DashboardMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step = "PerformanceError"):
                    report.addStep(reportname = "PerformanceError")
                report.addError(stepName = "PerformanceError", exitCode = 99900,
                                errorType = "PerformanceKill", errorDetails = msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 =  "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                logging.error("Attempting to kill job using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            except Exception:
                os.kill(stepPID, signal.SIGTERM)
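
The format of self.monitorBase is not shown in this snippet, so as an assumption only, the sketch below shows the kind of ps invocation that would yield the whitespace-separated columns periodicUpdate() parses, with indices 2-5 corresponding to RSS, VSize, %CPU and %MEM.

import subprocess

def samplePsFigures(stepPID):
    # Hypothetical ps call: grepping on the PID drops the header line, leaving
    # pid, ppid, rss, vsize, pcpu, pmem and the command as split() fields
    cmd = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i" % (stepPID, stepPID)
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    fields = result.stdout.split()
    # fields[2] = RSS (KiB), fields[3] = VSize (KiB), fields[4] = %CPU, fields[5] = %MEM
    return fields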
Example #5
            logging.error(msg)
            break

        if work == 'STOP':
            # Put the brakes on
            logging.info("submitWorker multiprocess issued STOP command!")
            break

        command = work.get('command', None)
        idList  = work.get('idList', [])
        if not command:
            results.put({'stdout': '', 'stderr': '999100\n Got no command!', 'idList': idList})
            continue

        try:
            stdout, stderr, returnCode = SubprocessAlgos.runCommand(cmd = command, shell = True, timeout = timeout)
            if returnCode == 0:
                results.put({'stdout': stdout, 'stderr': stderr, 'idList': idList, 'exitCode': returnCode})
            else:
                results.put({'stdout': stdout,
                             'stderr': 'Non-zero exit code: %s\n stderr: %s' % (returnCode, stderr),
                             'exitCode': returnCode,
                             'idList': idList})
        except Exception as ex:
            msg =  "Critical error in subprocess while submitting to condor"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            results.put({'stdout': '', 'stderr': '999101\n %s' % msg, 'idList': idList, 'exitCode': 999101})

    return 0
Example #6
    def testD_MyProxyDelegation(self):
        """
        _MyProxyDelegation_

        Test whether we can delegate a proxy via myproxy to this job

        IMPORTANT:
        If you are going to run this test you will have to set the serverCert/Key
        config options to point to your local server cert.  You will also have to
        run this job with your DN.  I don't recommend figuring out how to do this
        without knowing what you're doing in regards to proxy stuff.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(nRunning, 0, "User currently has %i running jobs.  Test will not continue" % (nRunning))

        # Get the config and set the removal time to -10 for testing
        proxyDir = os.path.join(self.testDir, 'proxyDir')
        os.mkdir(proxyDir)
        config = self.getConfig()
        config.BossAir.removeTime = -10.0
        config.BossAir.pluginNames.append('VanillaCondorPlugin')
        config.BossAir.delegatedServerCert = '/uscms/home/mnorman/.globus/cms-xen39crab3devcert.pem'
        config.BossAir.delegatedServerKey  = '/uscms/home/mnorman/.globus/cms-xen39crab3devkey.pem'
        config.BossAir.myproxyServer       = 'myproxy.cern.ch'
        config.BossAir.proxyDir            = proxyDir
        config.BossAir.delegatedServerHash = 'a6f078516a0beed5dcb31ba866868fa690069f9a'

        userDN = '/DC=org/DC=doegrids/OU=People/CN=Matthew Norman 453632'

        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs = nJobs)

        baAPI  = BossAirAPI(config = config)


        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom']      = {'location': 'malpaquet'}
            tmpJob['name']        = j['name']
            tmpJob['cache_dir']   = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin']      = 'VanillaCondorPlugin'
            tmpJob['owner']       = userDN
            tmpJob['packageDir']  = self.testDir
            tmpJob['sandbox']     = sandbox
            tmpJob['priority']    = None
            jobList.append(tmpJob)


        info = {}
        #info['packageDir'] = self.testDir
        info['index']      = 0
        info['sandbox']    = sandbox

        baAPI.submit(jobs = jobList, info = info)

        proxyFile = os.listdir(proxyDir)[0]
        stdout, stderr = SubprocessAlgos.runCommand(cmd = 'export X509_USER_PROXY=%s; voms-proxy-info' \
                                                    % os.path.join(proxyDir, proxyFile))
        self.assertEqual(stdout.split('\n')[0],
                         'subject   : %s/CN=proxy/CN=proxy/CN=proxy/CN=proxy' % userDN)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        SubprocessAlgos.runCommand(cmd = command, shell = False)

        return
Example #7
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {'PSS': 50660,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the ps monitor command and collate the data
        # Gathers RSS, %CPU and %MEM statistics from ps
        ps_cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)

        ps_output = stdout.split()
        if not len(ps_output) > 6:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % ps_output
            msg += "command = %s\n" % ps_cmd
            logging.error(msg)
            return

        # run the command to gather PSS memory statistics from /proc/<pid>/smaps
        smaps_cmd = self.pssMemoryCommand % (stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)

        smaps_output = stdout.split()
        if not len(smaps_output) == 1:
            # Then something went wrong in getting the smaps data
            msg = "Error when grabbing output from smaps\n"
            msg += "output = %s\n" % smaps_output
            msg += "command = %s\n" % smaps_cmd
            logging.error(msg)
            return

        # smaps also returns data in kiloBytes, let's make it megaBytes
        # I'm also confused with these megabytes and mebibytes...
        pss = int(smaps_output[0]) // 1000

        logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s", smaps_output[0], ps_output[2], ps_output[3], ps_output[4])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxPSS is not None and pss >= self.maxPSS:
            msg += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
            msg += "Job has PSS: %s MB\n" % pss
            killProc = True
            reason = 'PSS'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if not killProc:
            # then job is behaving well, there is nothing to do
            return

        # make sure we persist the performance error only once
        if not self.killRetry:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find an existing job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += "Error: %s" % str(ex)
                logging.exception(msg2)

        try:
            if not killHard and not self.killRetry:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
        finally:
            self.killRetry = True

        return
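
The pssMemoryCommand template is likewise not shown here; as an illustration of what it boils down to, this sketch sums the Pss entries from /proc/<pid>/smaps in pure Python and applies the same kB-to-MB division used by periodicUpdate() above.

def totalPssMegabytes(stepPID):
    pssKiloBytes = 0
    with open('/proc/%d/smaps' % stepPID) as smaps:
        for line in smaps:
            # each mapping contributes a line of the form "Pss:  1234 kB"
            if line.startswith('Pss:'):
                pssKiloBytes += int(line.split()[1])
    return pssKiloBytes // 1000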
Example #8
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return
        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s" %
                     (output[2], output[3], output[4], output[5]))

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        #Let's check the running time
        currentTime = time.time()

        if self.hardTimeout is not None and self.softTimeout is not None:
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                msg += "Job has been running for more than: %s\n" % str(
                    self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime -
                                                              self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location, '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find an existing job report.
                    # If not, we're in trouble
                    logging.debug(
                        "Found pre-existing error report in PerformanceMonitor termination."
                    )
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=99900,
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
Example #9
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {'RSS': 50660,
                           'VSZ': 50661,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return
        # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
        if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
            # then workload value is still in KiB (old way)
            rss = int(output[2])
            vsize = int(output[3])
        else:
            rss = int(output[2]) // 1024  # convert it to MiB
            vsize = int(output[3]) // 1024  # convert it to MiB
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s", output[2], output[3],
                     output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
            reason = 'RSS'
        elif self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True
            reason = 'VSZ'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find an existing job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return
Example #10
def submitWorker(input, results, timeout = None):
    """
    _submitWorker_

    Runs a command in a subprocess.

    This takes whatever you send it (a single ID),
    executes the command,
    and then returns the stdout result.

    I planned this to do a glite-job-output command
    in massively parallel fashion, possibly using the bulkID
    instead of the gridID.  Either way, all you have
    to change is the command here, and what is sent in
    by the complete() function.
    """

    # Get this started
    while True:
        try:
            work = input.get()
        except (EOFError, IOError) as ex:
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            crashMessage += str(ex)
            logging.error(crashMessage)
            break
        except Exception as ex:
            msg =  "Hit unidentified exception getting work\n"
            msg += str(ex)
            msg += "Assuming everything's totally hosed.  Killing process.\n"
            logging.error(msg)
            break

        if work == 'STOP':
            # Put the brakes on
            logging.info("submitWorker multiprocess issued STOP command!")
            break

        command = work.get('command', None)
        idList  = work.get('idList', [])
        if not command:
            results.put({'stdout': '', 'stderr': '999100\n Got no command!', 'idList': idList})
            continue

        try:
            stdout, stderr, returnCode = SubprocessAlgos.runCommand(cmd = command, shell = True, timeout = timeout)
            if returnCode == 0:
                results.put({'stdout': stdout, 'stderr': stderr, 'idList': idList, 'exitCode': returnCode})
            else:
                results.put({'stdout': stdout,
                             'stderr': 'Non-zero exit code: %s\n stderr: %s' % (returnCode, stderr),
                             'exitCode': returnCode,
                             'idList': idList})
        except Exception as ex:
            msg =  "Critical error in subprocess while submitting to condor"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            results.put({'stdout': '', 'stderr': '999101\n %s' % msg, 'idList': idList, 'exitCode': 999101})

    return 0
Example #11
            logging.error(msg)
            break

        if work == 'STOP':
            # Put the brakes on
            logging.info("submitWorker multiprocess issued STOP command!")
            break

        command = work.get('command', None)
        idList  = work.get('idList', [])
        if not command:
            results.put({'stdout': '', 'stderr': '999100\n Got no command!', 'idList': idList})
            continue

        try:
            stdout, stderr, returnCode = SubprocessAlgos.runCommand(cmd = command, shell = True, timeout = timeout)
            if returnCode == 0:
                results.put({'stdout': stdout, 'stderr': stderr, 'idList': idList, 'exitCode': returnCode})
            else:
                results.put({'stdout': stdout,
                             'stderr': 'Non-zero exit code: %s\n stderr: %s' % (returnCode, stderr),
                             'exitCode': returnCode,
                             'idList': idList})
        except Exception as ex:
            msg =  "Critical error in subprocess while submitting to condor"
            msg += str(ex)
            msg += str(traceback.format_exc())
            logging.error(msg)
            results.put({'stdout': '', 'stderr': '999101\n %s' % msg, 'idList': idList, 'exitCode': 999101})

    return 0
Example #12
    def testD_MyProxyDelegation(self):
        """
        _MyProxyDelegation_

        Test whether we can delegate a proxy via myproxy to this job

        IMPORTANT:
        If you are going to run this test you will have to set the serverCert/Key
        config options to point to your local server cert.  You will also have to
        run this job with your DN.  I don't recommend figuring out how to do this
        without knowing what you're doing in regards to proxy stuff.
        """

        nRunning = getCondorRunningJobs(self.user)
        self.assertEqual(
            nRunning, 0,
            "User currently has %i running jobs.  Test will not continue" %
            (nRunning))

        # Get the config and set the removal time to -10 for testing
        proxyDir = os.path.join(self.testDir, 'proxyDir')
        os.mkdir(proxyDir)
        config = self.getConfig()
        config.BossAir.removeTime = -10.0
        config.BossAir.pluginNames.append('VanillaCondorPlugin')
        config.BossAir.delegatedServerCert = '/uscms/home/mnorman/.globus/cms-xen39crab3devcert.pem'
        config.BossAir.delegatedServerKey = '/uscms/home/mnorman/.globus/cms-xen39crab3devkey.pem'
        config.BossAir.myproxyServer = 'myproxy.cern.ch'
        config.BossAir.proxyDir = proxyDir
        config.BossAir.delegatedServerHash = 'a6f078516a0beed5dcb31ba866868fa690069f9a'

        userDN = '/DC=org/DC=doegrids/OU=People/CN=Matthew Norman 453632'

        nJobs = 10

        jobDummies = self.createDummyJobs(nJobs=nJobs)

        baAPI = BossAirAPI(config=config)

        jobPackage = os.path.join(self.testDir, 'JobPackage.pkl')
        f = open(jobPackage, 'w')
        f.write(' ')
        f.close()

        sandbox = os.path.join(self.testDir, 'sandbox.box')
        f = open(sandbox, 'w')
        f.write(' ')
        f.close()

        jobList = []
        for j in jobDummies:
            tmpJob = {'id': j['id']}
            tmpJob['custom'] = {'location': 'malpaquet'}
            tmpJob['name'] = j['name']
            tmpJob['cache_dir'] = self.testDir
            tmpJob['retry_count'] = 0
            tmpJob['plugin'] = 'VanillaCondorPlugin'
            tmpJob['owner'] = userDN
            tmpJob['packageDir'] = self.testDir
            tmpJob['sandbox'] = sandbox
            tmpJob['priority'] = None
            jobList.append(tmpJob)

        info = {}
        #info['packageDir'] = self.testDir
        info['index'] = 0
        info['sandbox'] = sandbox

        baAPI.submit(jobs=jobList, info=info)

        proxyFile = os.listdir(proxyDir)[0]
        stdout, stderr = SubprocessAlgos.runCommand(cmd = 'export X509_USER_PROXY=%s; voms-proxy-info' \
                                                    % os.path.join(proxyDir, proxyFile))
        self.assertEqual(
            stdout.split('\n')[0],
            'subject   : %s/CN=proxy/CN=proxy/CN=proxy/CN=proxy' % userDN)

        # Now kill 'em manually
        command = ['condor_rm', self.user]
        SubprocessAlgos.runCommand(cmd=command, shell=False)

        return
Example #13
    def periodicUpdate(self):
        """
        Run on the defined intervals.

        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {
            'RSS': 50660,
            'VSZ': 50661,
            'Wallclock time': 50664,
            '': 99999
        }

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return
        # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
        if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
            # then workload value is still in KiB (old way)
            rss = int(output[2])
            vsize = int(output[3])
        else:
            # ps returns data in kiloBytes, let's make it megaBytes
            # I'm so confused with these megabytes and mebibytes...
            rss = int(output[2]) // 1000  # convert it to MB
            vsize = int(output[3]) // 1000  # convert it to MB
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s;  VSize: %s; PCPU: %s; PMEM: %s", output[2],
                     output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
            reason = 'RSS'
        elif self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True
            reason = 'VSZ'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(
                    self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime -
                                                              self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location, '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find an existing job report.
                    # If not, we're in trouble
                    logging.debug(
                        "Found pre-existing error report in PerformanceMonitor termination."
                    )
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return
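
Illustrative only: a small helper (not WMCore code) capturing the kill-escalation policy shared by these periodicUpdate() variants, where SIGUSR2 asks CMSSW to shut down and write its report, and a hard timeout, a repeated attempt, or a failed soft kill escalates to SIGTERM.

import os
import signal
import logging

def killStep(stepPID, killHard=False, killRetry=False):
    try:
        if not killHard and not killRetry:
            logging.error("Attempting to kill step using SIGUSR2")
            os.kill(stepPID, signal.SIGUSR2)
        else:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
    except Exception:
        logging.error("Attempting to kill step using SIGTERM")
        os.kill(stepPID, signal.SIGTERM)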