Ejemplo n.º 1
0
    def test_actions_after_job_process_crash(self):
        # Scenario: there is no process for the pid, yet the database
        # record still reports the job as 'running'.
        self.get_job_status.return_value = 'running'
        self.is_pid_running.return_value = False

        supervisor.supervise(1, self.job.id, timeout=0.1)

        # Recording the stop time and triggering the cleanup must each
        # have happened exactly once, with only the job id as argument.
        for hook in (self.record_job_stop_time, self.cleanup_after_job):
            self.assertEqual(1, hook.call_count)
            self.assertEqual(((self.job.id,), {}), hook.call_args)

        # The job record is updated once and carries the error message.
        status_update = self.update_job_status_and_error_msg
        self.assertEqual(1, status_update.call_count)
        expected_kwargs = {'error_msg': 'job process 1 crashed or terminated'}
        self.assertEqual(
            ((self.job.id,), expected_kwargs), status_update.call_args)
Ejemplo n.º 2
0
    def test_actions_after_job_process_failures(self):
        # The job process is running but has some failure counters above
        # zero. The assertions below expect the process to be terminated,
        # the stop time to be recorded and the cleanup to be triggered.
        consumer_cls = supervisor.SupervisorLogMessageConsumer
        # Shorten the delay to checking failure counters. The previous
        # value is saved and restored afterwards so that this class-level
        # override does not leak into tests that run later.
        saved_fcc_delay = consumer_cls.FCC_DELAY
        consumer_cls.FCC_DELAY = 2
        try:
            self.is_pid_running.return_value = True
            self.get_job_status.return_value = 'running'

            # start from clean counters, then register three failures
            stats.delete_job_counters(self.job.id)
            stats.incr_counter(self.job.id, "h", "a:failed")
            stats.incr_counter(self.job.id, "r", "b:failed")
            stats.incr_counter(self.job.id, "r", "b:failed")
            supervisor.supervise(1, self.job.id, timeout=0.1)
        finally:
            consumer_cls.FCC_DELAY = saved_fcc_delay

        # the job process is terminated
        self.assertEqual(1, self.terminate_job.call_count)
        self.assertEqual(((1, ), {}), self.terminate_job.call_args)

        # stop time is recorded
        self.assertEqual(1, self.record_job_stop_time.call_count)
        self.assertEqual(((self.job.id, ), {}),
                         self.record_job_stop_time.call_args)

        # the cleanup is triggered
        self.assertEqual(1, self.cleanup_after_job.call_count)
        self.assertEqual(((self.job.id, terminate), {}),
                         self.cleanup_after_job.call_args)
Ejemplo n.º 3
0
    def test_actions_after_job_process_failures(self):
        # The job process is running but has some failure counters above
        # zero. The assertions below expect the process to be terminated,
        # the stop time to be recorded and the cleanup to be triggered.
        consumer_cls = supervisor.SupervisorLogMessageConsumer
        # Shorten the delay to checking failure counters. The previous
        # value is saved and restored afterwards so that this class-level
        # override does not leak into tests that run later.
        saved_fcc_delay = consumer_cls.FCC_DELAY
        consumer_cls.FCC_DELAY = 2
        try:
            self.is_pid_running.return_value = True
            self.get_job_status.return_value = 'running'

            # start from clean counters, then register three failures
            stats.delete_job_counters(self.job.id)
            stats.incr_counter(self.job.id, "h", "a:failed")
            stats.incr_counter(self.job.id, "r", "b:failed")
            stats.incr_counter(self.job.id, "r", "b:failed")
            supervisor.supervise(1, self.job.id, timeout=0.1)
        finally:
            consumer_cls.FCC_DELAY = saved_fcc_delay

        # the job process is terminated
        self.assertEqual(1, self.terminate_job.call_count)
        self.assertEqual(((1,), {}), self.terminate_job.call_args)

        # stop time is recorded
        self.assertEqual(1, self.record_job_stop_time.call_count)
        self.assertEqual(
            ((self.job.id,), {}),
            self.record_job_stop_time.call_args)

        # the cleanup is triggered
        self.assertEqual(1, self.cleanup_after_job.call_count)
        self.assertEqual(
            ((self.job.id,), {}),
            self.cleanup_after_job.call_args)
Ejemplo n.º 4
0
    def test_actions_after_job_process_termination(self):
        # Scenario: the job process is no longer running and the job
        # status is reported as 'succeeded'.
        self.get_job_status.return_value = 'succeeded'
        self.is_pid_running.return_value = False

        supervisor.supervise(1, self.job.id, timeout=0.1)

        # the stop time was recorded exactly once, for this job
        stop_time_mock = self.record_job_stop_time
        self.assertEqual(1, stop_time_mock.call_count)
        self.assertEqual(((self.job.id,), {}), stop_time_mock.call_args)

        # the cleanup ran exactly once, with the job id and `terminate`
        cleanup_mock = self.cleanup_after_job
        self.assertEqual(1, cleanup_mock.call_count)
        expected_cleanup_call = ((self.job.id, terminate), {})
        self.assertEqual(expected_cleanup_call, cleanup_mock.call_args)
Ejemplo n.º 5
0
    def test_actions_after_a_critical_message(self):
        # Scenario: the job process is running and the supervisor is
        # handed a CRITICAL log record for the job.
        self.is_pid_running.return_value = True

        def deliver_critical_record(consumer):
            # Replacement for the consumer's run(): pass one CRITICAL
            # record to the log callback, then assert that the consumer's
            # `_stopped` attribute is truthy.
            critical = logging.LogRecord(
                'oq.job.%s' % self.job.id, logging.CRITICAL, 'path', 42,
                'a msg', (), None)
            consumer.log_callback(critical)
            assert consumer._stopped

        run_target = ('openquake.engine.supervising.'
                      'supervisor.SupervisorLogMessageConsumer.run')
        with patch(run_target) as fake_run:
            # the supervisor will receive a msg
            fake_run.side_effect = deliver_critical_record

            supervisor.supervise(1, self.job.id, timeout=0.1)

            # the job process is terminated (one call, pid only)
            terminate_mock = self.terminate_job
            self.assertEqual(1, terminate_mock.call_count)
            self.assertEqual(((1,), {}), terminate_mock.call_args)

            # the stop time is recorded and the cleanup is triggered,
            # each exactly once with only the job id
            for hook in (self.record_job_stop_time, self.cleanup_after_job):
                self.assertEqual(1, hook.call_count)
                self.assertEqual(((self.job.id,), {}), hook.call_args)

            # the job record is updated with the text of the message
            status_update = self.update_job_status_and_error_msg
            self.assertEqual(1, status_update.call_count)
            self.assertEqual(
                ((self.job.id, 'a msg'), {}), status_update.call_args)
Ejemplo n.º 6
0
    def test_actions_after_job_process_termination(self):
        # Scenario: the job process is no longer running and the job
        # status is reported as 'succeeded'.
        self.get_job_status.return_value = 'succeeded'
        self.is_pid_running.return_value = False

        supervisor.supervise(1, self.job.id, timeout=0.1)

        # First the stop-time recording, then the cleanup: each must have
        # been invoked exactly once with only the job id.
        for hook in (self.record_job_stop_time, self.cleanup_after_job):
            self.assertEqual(1, hook.call_count)
            self.assertEqual(((self.job.id,), {}), hook.call_args)
Ejemplo n.º 7
0
    def test_actions_after_job_process_crash(self):
        # Scenario: there is no process for the pid, yet the database
        # record still reports the job as 'running'.
        self.get_job_status.return_value = 'running'
        self.is_pid_running.return_value = False

        supervisor.supervise(1, self.job.id, timeout=0.1)

        # the stop time was recorded exactly once, for this job
        stop_time_mock = self.record_job_stop_time
        self.assertEqual(1, stop_time_mock.call_count)
        self.assertEqual(((self.job.id,), {}), stop_time_mock.call_args)

        # the cleanup ran exactly once, with the job id and `terminate`
        cleanup_mock = self.cleanup_after_job
        self.assertEqual(1, cleanup_mock.call_count)
        expected_cleanup_call = ((self.job.id, terminate), {})
        self.assertEqual(expected_cleanup_call, cleanup_mock.call_args)

        # the status in the job record was updated exactly once
        status_mock = self.update_job_status
        self.assertEqual(1, status_mock.call_count)
        self.assertEqual(((self.job.id,), {}), status_mock.call_args)
Ejemplo n.º 8
0
    def test_actions_after_a_critical_message(self):
        # Scenario: the job process is running and the supervisor is
        # handed a CRITICAL log record for the job.
        self.is_pid_running.return_value = True

        def deliver_critical_record(consumer):
            # Replacement for the consumer's run(): pass one CRITICAL
            # record to the log callback, then assert that the consumer's
            # `_stopped` attribute is truthy.
            critical = logging.LogRecord(
                'oq.job.%s' % self.job.id,
                logging.CRITICAL,
                'path',
                42,
                'a msg',
                (),
                None)
            consumer.log_callback(critical)
            assert consumer._stopped

        run_target = ('openquake.engine.supervising.'
                      'supervisor.SupervisorLogMessageConsumer.run')
        with patch(run_target) as fake_run:
            # the supervisor will receive a msg
            fake_run.side_effect = deliver_critical_record

            supervisor.supervise(1, self.job.id, timeout=0.1)

            # the job process is terminated (one call, pid only)
            terminate_mock = self.terminate_job
            self.assertEqual(1, terminate_mock.call_count)
            self.assertEqual(((1,), {}), terminate_mock.call_args)

            # the stop time is recorded exactly once, for this job
            stop_time_mock = self.record_job_stop_time
            self.assertEqual(1, stop_time_mock.call_count)
            self.assertEqual(((self.job.id,), {}), stop_time_mock.call_args)

            # the cleanup ran exactly once, with the job id and `terminate`
            cleanup_mock = self.cleanup_after_job
            self.assertEqual(1, cleanup_mock.call_count)
            expected_cleanup_call = ((self.job.id, terminate), {})
            self.assertEqual(expected_cleanup_call, cleanup_mock.call_args)

            # the status in the job record was updated exactly once
            status_mock = self.update_job_status
            self.assertEqual(1, status_mock.call_count)
            self.assertEqual(((self.job.id,), {}), status_mock.call_args)
Ejemplo n.º 9
0
            _do_run_calc(job, exports, calc, job_type)
        except Exception, ex:
            logs.LOG.critical("Calculation failed with exception: '%s'"
                              % str(ex))
            raise
        finally:
            job.is_running = False
            job.save()
        return

    supervisor_pid = os.fork()
    if not supervisor_pid:
        # supervisor process
        logs.set_logger_level(logs.logging.root, log_level)
        # TODO: deal with KVS garbage collection
        supervisor.supervise(job_pid, job.id, log_file=log_file)
        return

    # parent process

    # ignore Ctrl-C as well as supervisor process does. thus only
    # job executor terminates on SIGINT
    supervisor.ignore_sigint()
    # wait till both child processes are done
    os.waitpid(job_pid, 0)
    os.waitpid(supervisor_pid, 0)

    # Refresh the job record, since the forked processes are going to modify
    # job state.
    return models.OqJob.objects.get(id=job.id)
Ejemplo n.º 10
0
                _job_exec(job, log_level, exports, job_type, calc)
            except Exception, ex:
                logs.LOG.critical("Calculation failed with exception: '%s'" %
                                  str(ex))
                raise
            finally:
                job.is_running = False
                job.save()
            return

        supervisor_pid = os.fork()
        if not supervisor_pid:
            # supervisor process
            logs.set_logger_level(logs.logging.root, log_level)
            # TODO: deal with KVS garbage collection
            supervisor.supervise(job_pid, job.id, log_file=log_file)
            return

        # parent process

        # ignore Ctrl-C as well as supervisor process does. thus only
        # job executor terminates on SIGINT
        supervisor.ignore_sigint()
        # wait till both child processes are done
        os.waitpid(job_pid, 0)
        os.waitpid(supervisor_pid, 0)
    else:
        try:
            _job_exec(job, log_level, exports, job_type, calc)
        except Exception, ex:
            logs.LOG.critical("Calculation failed with exception: '%s'" %