Example #1
    def postRun(self, job):
        """
        Called after a job has finished running.

        This method calls the derived scheduler's postLaunchCommand to
        obtain a possible command to run once this group of jobs has
        launched.
        """
        tester = job.getTester()
        launchable_jobs = []

        # Check if all jobs in this group are pending (meaning they were launched)
        if tester.isQueued() and not self.__status_check:
            current_dag = job.getDAG()

            # If current_dag is empty this is the last job to have been removed
            if current_dag.size() == 0:
                original_dag = job.getOriginalDAG()

                for ind_job in original_dag.topological_sort():
                    launchable_jobs.append(ind_job)

        # Ask derived scheduler to build a postRun command
        if launchable_jobs:
            command = self.postLaunchCommand(launchable_jobs)
            if command:
                util.runCommand(command)
Example #2
 def postLaunch(self, jobs):
     """ Release jobs that were placed on hold """
     print('releasing jobs...')
     # TODO: Do something better than this one-at-a-time approach.
     #       We should also return a command to run rather than calling runCommand ourselves.
     for job_container in jobs:
         test_unique = self.getUnique(job_container)
         json_session = self.getData(test_unique, job_id=True)
         util.runCommand('qrls %s' % (json_session['job_id']))
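
The TODO above suggests releasing every held job with one command and returning it to the caller rather than invoking runCommand per job. A minimal sketch of that refactor, assuming getUnique/getData behave as in the example above and that the site's qrls accepts multiple job identifiers:

 def postLaunch(self, jobs):
     """ Build a single release command instead of running qrls per job """
     job_ids = []
     for job_container in jobs:
         test_unique = self.getUnique(job_container)
         json_session = self.getData(test_unique, job_id=True)
         job_ids.append(json_session['job_id'])

     # Return one 'qrls <id> <id> ...' command for the caller to run, or None
     if job_ids:
         return 'qrls %s' % ' '.join(job_ids)
     return None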
Example #3
    def processResults(self, moose_dir, options, output):
        FileTester.processResults(self, moose_dir, options, output)

        if self.isFail() or self.specs['skip_checks']:
            return output

        # Don't Run Exodiff on Scaled Tests
        if options.scaling and self.specs['scale_refine']:
            return output

        # Make sure that all of the Exodiff files are actually available
        for file in self.specs['exodiff']:
            if not os.path.exists(os.path.join(self.specs['test_dir'], self.specs['gold_dir'], file)):
                output += "File Not Found: " + os.path.join(self.specs['test_dir'], self.specs['gold_dir'], file)
                self.setStatus(self.fail, 'MISSING GOLD FILE')
                break

        if not self.isFail():
            # Retrieve the commands
            commands = self.processResultsCommand(moose_dir, options)

            for command in commands:
                exo_output = util.runCommand(command)

                output += 'Running exodiff: ' + command + '\n' + exo_output + ' ' + ' '.join(self.specs['exodiff_opts'])

                if ('different' in exo_output or 'ERROR' in exo_output) and "Files are the same" not in exo_output:
                    self.setStatus(self.diff, 'EXODIFF')
                    break

        return output
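
For clarity, the pass/fail decision in the loop above reduces to a single predicate on the exodiff output. The helper and sample strings below are purely illustrative; they are not part of the tester and not captured exodiff output:

def exodiff_failed(exo_output):
    # Mirrors the check above: 'different' or 'ERROR' text fails the test,
    # unless exodiff explicitly reports that the files are the same.
    return (('different' in exo_output or 'ERROR' in exo_output)
            and 'Files are the same' not in exo_output)

assert exodiff_failed('exodiff: Files are different')
assert not exodiff_failed('exodiff: Files are the same')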
Example #4
 def executeAndGetJobOutput(self, job):
     """
     Execute derived commands and obtain the output
     """
     json_data = self.getData(job.getUniqueIdentifier(), working_dir=True)
     output = util.runCommand(self.getQueueCommand(job), cwd=json_data['working_dir'])
     return output
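
Every example in this section funnels through util.runCommand. Its implementation is not shown here; the sketch below only illustrates the contract these callers assume (run a shell command, optionally in a working directory, return the combined output as text, and prefix it with 'ERROR' on a non-zero exit) and is not the actual TestHarness code:

import subprocess

def runCommand(cmd, cwd=None):
    # Run the command through a shell, merging stderr into stdout so the
    # callers above can scan a single output string.
    proc = subprocess.Popen(cmd, shell=True, cwd=cwd,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = proc.communicate()[0].decode('utf-8', errors='replace')
    if proc.returncode != 0:
        output = 'ERROR: ' + output
    return output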
Example #5
    def processResults(self, moose_dir, options, output):
        FileTester.processResults(self, moose_dir, options, output)

        if self.isFail() or self.specs['skip_checks']:
            return output

        # Don't Run CSVDiff on Scaled Tests
        if options.scaling and self.specs['scale_refine']:
            return output

        # Make sure that all of the CSVDiff files are actually available
        for file in self.specs['csvdiff']:
            if not os.path.exists(
                    os.path.join(self.specs['test_dir'],
                                 self.specs['gold_dir'], file)):
                output += "File Not Found: " + os.path.join(
                    self.specs['test_dir'], self.specs['gold_dir'], file)
                self.setStatus(self.fail, 'MISSING GOLD FILE')
                break

        if not self.isFail():
            # Retrieve the commands
            commands = self.processResultsCommand(moose_dir, options)

            for command in commands:
                exo_output = util.runCommand(command)
                output += 'Running csvdiff: ' + command + '\n' + exo_output
                if not "Files are the same" in exo_output:
                    self.setStatus(self.diff, 'CSVDIFF')
                    break

        return output
Example #6
    def processResults(self, moose_dir, options, output):
        output += FileTester.processResults(self, moose_dir, options, output)

        if self.isFail() or self.specs['skip_checks']:
            return output

        # Don't Run JSONDiff on Scaled Tests
        if options.scaling and self.specs['scale_refine']:
            return output

        # Check if files exist
        for file in self.specs['jsondiff']:
            # Get file names and error if not found
            gold_file = os.path.join(self.getTestDir(), self.specs['gold_dir'], file)
            test_file = os.path.join(self.getTestDir(), file)
            if not os.path.exists(gold_file):
                output += "File Not Found: " + gold_file
                self.setStatus(self.fail, 'MISSING GOLD FILE')
            if not os.path.exists(test_file):
                output += "File Not Found: " + test_file
                self.setStatus(self.fail, 'MISSING OUTPUT FILE')

        if not self.isFail():
            commands = self.processResultsCommand(moose_dir, options)
            for command in commands:
                exo_output = util.runCommand(command)
                output += 'Running jsondiff: ' + command + '\n' + exo_output
                if not "Files are the same" in exo_output:
                    self.setStatus(self.diff, 'JSONDIFF')

        return output
Example #7
 def executeAndGetJobOutput(self, job):
     """
     Execute derived commands and obtain the output
     """
     json_data = self.getData(job.getUniqueIdentifier(), working_dir=True)
     output = util.runCommand(self.getQueueCommand(job),
                              cwd=json_data['working_dir'])
     return output
Example #8
    def processJobs(self, jobs):
        """
        Perform status checks on 'jobs' already launched.

        This method is special in that it iterates over a reversed list
        of concurrent jobs generated by the DAG object contained in
        'jobs' rather than the supplied ordered list of 'jobs'. This
        also means skipped jobs must be handled separately, since
        skipped tests are not contained in the DAG.
        """
        launched_dags = []
        for job_container in jobs:
            tester = job_container.getTester()
            test_unique = self.getUnique(job_container)

            # Print the skipped/silent/deleted tests now, since they will not be in the DAG.
            if tester.isSkipped() or tester.isSilent() or tester.isDeleted():
                self.statusJobs([job_container])
                continue

            # Only interested in jobs contained in the current session
            if test_unique not in self.__session_data.keys():
                continue

            # Get the DAGs we need
            job_dag = job_container.getDAG()
            original_dag = job_container.getOriginalDAG()
            reverse_dag = original_dag.reverse_edges()

            # job_dag is shared amongst a group of job_containers; only process
            # each group's DAG once by skipping any DAG we have already seen.
            if job_dag in launched_dags:
                continue
            launched_dags.append(job_dag)

            # Run the postProcess checks in reverse order
            for original_job_container in reverse_dag.topological_sort():
                # Use a previous status whenever possible
                if self.__useImmediateStatus(original_job_container):
                    self.statusJobs([original_job_container])
                    continue

                # Generate and run appropriate command for current state of operation
                queue_command = self.getQueueCommand(original_job_container)
                output = util.runCommand(queue_command)

                # Call derived methods to set a job status
                self.handleJobStatus(original_job_container, output)

                # Print the results
                self.statusJobs([original_job_container])
Example #9
    def hasTimedOutOrFailed(self, job_data):
        """ use qstat and return bool on job failures outside of the TestHarness's control """
        launch_id = job_data.json_data.get(job_data.job_dir,
                                           {}).get(job_data.plugin,
                                                   {}).get('ID',
                                                           "").split('.')[0]

        # We shouldn't run into a null, but just in case, let's handle it
        if launch_id:
            qstat_command_result = util.runCommand('qstat -xf %s' %
                                                   (launch_id))

            # Handle a failure of the qstat invocation itself
            if qstat_command_result.find('ERROR') != -1:
                # set error for each job contained in group
                for job in job_data.jobs.getJobs():
                    job.setOutput('ERROR invoking `qstat`\n%s' %
                                  (qstat_command_result))
                    job.setStatus(job.error, 'QSTAT')
                return True

            qstat_job_result = re.findall(r'Exit_status = (\d+)',
                                          qstat_command_result)

            # Whoops. This job was killed by PBS for exceeding walltime
            if qstat_job_result and qstat_job_result[0] == "271":
                for job in job_data.jobs.getJobs():
                    job.addCaveats('Killed by PBS Exceeded Walltime')
                return True

            # Capture TestHarness exceptions
            elif qstat_job_result and qstat_job_result[0] != "0":

                # Try to gather some useful output we can attach to one of the job objects
                output_file = job_data.json_data.get(job_data.job_dir, {}).get(
                    job_data.plugin, {}).get('QSUB_OUTPUT', "")
                if os.path.exists(output_file):
                    with open(output_file, 'r') as f:
                        output_string = util.readOutput(
                            f, None,
                            job_data.jobs.getJobs()[0].getTester())
                    job_data.jobs.getJobs()[0].setOutput(output_string)

                # Add a caveat to each job, explaining that one of the jobs caused a TestHarness exception
                for job in job_data.jobs.getJobs():
                    job.addCaveats('TESTHARNESS EXCEPTION')
                return True

        # No timeout or failure was detected
        return False
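
The Exit_status parsing above is easy to verify in isolation. The snippet below runs the same regular expression against a fabricated qstat -xf transcript (illustrative only; 271 is the PBS exit status the code above treats as a walltime kill):

import re

sample_qstat_output = '''Job Id: 12345.sdb
    Job_Name = my_test
    job_state = F
    Exit_status = 271'''

exit_status = re.findall(r'Exit_status = (\d+)', sample_qstat_output)
assert exit_status and exit_status[0] == '271'  # killed for exceeding walltime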
Example #10
    def run(self, job):
        """ execute qsub and return the launch id """
        template = self._augmentTemplate(job)
        tester = job.getTester()

        self.createQueueScript(job, template)

        command = ' '.join(['qsub', template['launch_script']])
        launch_results = util.runCommand(command, job.getTestDir())

        # List of files we need to clean up when we are done
        dirty_files = [template['launch_script'], template['output']]

        self.addDirtyFiles(job, dirty_files)

        if launch_results.find('ERROR') != -1:
            # The executor job failed (so fail all jobs in this group)
            job_dag = job.getDAG()

            for other_job in [
                    x for x in job_dag.topological_sort() if x != job
            ]:
                other_job.clearCaveats()
                other_tester = other_job.getTester()
                other_tester.setStatus(other_tester.fail, 'launch failure')

            # This is _only_ to make the failed message more useful
            tester.specs['test_dir'] = ''
            tester.specs['command'] = command
            tester.setStatus(tester.fail, 'QSUB Group Failure')
            job.setOutput(launch_results)

        else:
            job.addMetaData(
                RunPBS={
                    'ID': launch_results,
                    'QSUB_COMMAND': command,
                    'NCPUS': template['mpi_procs'],
                    'WALLTIME': template['walltime'],
                    'QSUB_OUTPUT': template['output']
                })
            tester.setStatus(tester.no_status, 'LAUNCHING')
Example #11
    def processResults(self, moose_dir, options, output):
        FileTester.processResults(self, moose_dir, options, output)

        if self.getStatus() == self.bucket_fail or self.specs['skip_checks']:
            return output

        # Don't Run Exodiff on Scaled Tests
        if options.scaling and self.specs['scale_refine']:
            self.success_message = "SCALED"
            self.setStatus(self.getSuccessMessage(), self.bucket_success)
            return output

        # Make sure that all of the Exodiff files are actually available
        for file in self.specs['exodiff']:
            if not os.path.exists(
                    os.path.join(self.specs['test_dir'],
                                 self.specs['gold_dir'], file)):
                output += "File Not Found: " + os.path.join(
                    self.specs['test_dir'], self.specs['gold_dir'], file)
                self.setStatus('MISSING GOLD FILE', self.bucket_fail)
                break

        if self.getStatus() != self.bucket_fail:
            # Retrieve the commands
            commands = self.processResultsCommand(moose_dir, options)

            for command in commands:
                exo_output = util.runCommand(command)

                output += 'Running exodiff: ' + command + '\n' + exo_output + ' ' + ' '.join(
                    self.specs['exodiff_opts'])

                if (('different' in exo_output or 'ERROR' in exo_output)
                        and "Files are the same" not in exo_output):
                    self.setStatus('EXODIFF', self.bucket_diff)
                    break

        # If status is still pending, then it is a passing test
        if self.getStatus() == self.bucket_pending:
            self.setStatus(self.success_message, self.bucket_success)

        return output
Example #12
    def hasTimedOutOrFailed(self, job_data):
        """ use qstat and return bool on job failures outside of the TestHarness's control """
        launch_id = job_data.json_data.get(job_data.job_dir,
                                           {}).get(job_data.plugin,
                                                   {}).get('ID', "").split('.')[0]

        # We shouldn't run into a null, but just in case, let's handle it
        if launch_id:
            qstat_command_result = util.runCommand('qstat -xf %s' % (launch_id))

            # Handle a failure of the qstat invocation itself
            if qstat_command_result.find('ERROR') != -1:
                # set error for each job contained in group
                for job in job_data.jobs.getJobs():
                    job.setOutput('ERROR invoking `qstat`\n%s' % (qstat_command_result))
                    job.setStatus(job.error, 'QSTAT')
                return True

            qstat_job_result = re.findall(r'Exit_status = (\d+)', qstat_command_result)

            # Whoops. This job was killed by PBS for exceeding walltime
            if qstat_job_result and qstat_job_result[0] == "271":
                for job in job_data.jobs.getJobs():
                    job.addCaveats('Killed by PBS Exceeded Walltime')
                return True

            # Capture TestHarness exceptions
            elif qstat_job_result and qstat_job_result[0] != "0":

                # Try to gather some useful output we can attach to one of the job objects
                output_file = job_data.json_data.get(job_data.job_dir, {}).get(job_data.plugin, {}).get('QSUB_OUTPUT', "")
                if os.path.exists(output_file):
                    with open(output_file, 'r') as f:
                        output_string = util.readOutput(f, None, self.options)
                    job_data.jobs.getJobs()[0].setOutput(output_string)

                # Add a caveat to each job, explaining that one of the jobs caused a TestHarness exception
                for job in job_data.jobs.getJobs():
                    job.addCaveats('TESTHARNESS EXCEPTION')
                return True

        # No timeout or failure was detected
        return False
Example #13
    def runJobs(self, jobs):
        """ Queue list of jobs to run """
        for job_container in jobs:
            # store launched jobs so we can use them when TestHarness calls waitFinish.
            self.__jobs.add(job_container)

            # Augment queue parameters
            template = self.augmentQueueParamsBase(job_container)

            # Prepare the worker directory
            self.__copyFiles(job_container, template)

            # Write the execution file
            self.__prepareQueueScript(template)

            # Save template information
            test_unique = self.getUnique(job_container)
            self.putData(test_unique,
                         queue_script=template['queue_script'],
                         working_dir=template['working_dir'],
                         job_name=template['job_name'])

            # Get derived launch command and launch this job (blocking)
            third_party_command = self.getQueueCommand(job_container)
            output = util.runCommand(third_party_command,
                                     cwd=template['working_dir'])

            # Call derived methods to ascertain job status
            self.handleJobStatus(job_container, output)

            # Print results
            self.queueJobs(status_jobs=[job_container])

            # Delete this job and get the next list of jobs
            job_dag = job_container.getDAG()
            job_dag.delete_node(job_container)
            next_job_group = self.getNextJobGroup(job_dag)

            # run these new jobs
            self.queueJobs(run_jobs=next_job_group)
Example #14
    def runJobs(self, jobs):
        """ Queue list of jobs to run """
        for job_container in jobs:
            # store launched jobs so we can use them when TestHarness calls waitFinish.
            self.__jobs.add(job_container)

            # Augment queue parameters
            template = self.augmentQueueParamsBase(job_container)

            # Prepare the worker directory
            self.__copyFiles(job_container, template)

            # Write the execution file
            self.__prepareQueueScript(template)

            # Save template information
            test_unique = self.getUnique(job_container)
            self.putData(test_unique,
                         queue_script=template['queue_script'],
                         working_dir=template['working_dir'],
                         job_name=template['job_name'])

            # Get derived launch command and launch this job (blocking)
            third_party_command = self.getQueueCommand(job_container)
            output = util.runCommand(third_party_command, cwd=template['working_dir'])

            # Call derived methods to ascertain job status
            self.handleJobStatus(job_container, output)

            # Print results
            self.queueJobs(status_jobs=[job_container])

            # Delete this job and get the next list of jobs
            job_dag = job_container.getDAG()
            job_dag.delete_node(job_container)
            next_job_group = self.getNextJobGroup(job_dag)

            # run these new jobs
            self.queueJobs(run_jobs=next_job_group)
Example #15
    def run(self, job):
        """ execute qsub and return the launch id """
        template = self._augmentTemplate(job)
        tester = job.getTester()

        self.createQueueScript(job, template)

        command = ' '.join(['qsub', template['launch_script']])
        launch_results = util.runCommand(command, job.getTestDir())

        # List of files we need to clean up when we are done
        dirty_files = [template['launch_script'],
                       template['output']]

        self.addDirtyFiles(job, dirty_files)

        if launch_results.find('ERROR') != -1:
            # The executor job failed (so fail all jobs in this group)
            job_dag = job.getDAG()

            for other_job in [x for x in job_dag.topological_sort() if x != job]:
                other_job.clearCaveats()
                other_tester = other_job.getTester()
                other_tester.setStatus(other_tester.fail, 'launch failure')

            # This is _only_ to make the failed message more useful
            tester.specs['test_dir'] = ''
            tester.specs['command'] = command
            tester.setStatus(tester.fail, 'QSUB Group Failure')
            job.setOutput(launch_results)

        else:
            job.addMetaData(RunPBS={'ID' : launch_results,
                                    'QSUB_COMMAND' : command,
                                    'NCPUS' : template['mpi_procs'],
                                    'WALLTIME' : template['walltime'],
                                    'QSUB_OUTPUT' : template['output']})
            tester.setStatus(tester.no_status, 'LAUNCHING')
Example #16
    def processResults(self, moose_dir, options, output):
        FileTester.processResults(self, moose_dir, options, output)

        if self.getStatus() == self.bucket_fail or self.specs['skip_checks']:
            return output

        # Don't Run Exodiff on Scaled Tests
        if options.scaling and self.specs['scale_refine']:
            self.success_message = "SCALED"
            self.setStatus(self.getSuccessMessage(), self.bucket_success)
            return output

        # Make sure that all of the Exodiff files are actually available
        for file in self.specs['exodiff']:
            if not os.path.exists(os.path.join(self.specs['test_dir'], self.specs['gold_dir'], file)):
                output += "File Not Found: " + os.path.join(self.specs['test_dir'], self.specs['gold_dir'], file)
                self.setStatus('MISSING GOLD FILE', self.bucket_fail)
                break

        if self.getStatus() != self.bucket_fail:
            # Retrieve the commands
            commands = self.processResultsCommand(moose_dir, options)

            for command in commands:
                exo_output = util.runCommand(command)

                output += 'Running exodiff: ' + command + '\n' + exo_output + ' ' + ' '.join(self.specs['exodiff_opts'])

                if ('different' in exo_output or 'ERROR' in exo_output) and "Files are the same" not in exo_output:
                    self.setStatus('EXODIFF', self.bucket_diff)
                    break

        # If status is still pending, then it is a passing test
        if self.getStatus() == self.bucket_pending:
            self.setStatus(self.success_message, self.bucket_success)

        return output