Example #1
0
    def test_actions_after_a_critical_message(self):
        # Scenario: the job process is running and the (patched) log message
        # consumer is handed a single CRITICAL record for job 123.
        self.is_pid_running.return_value = True

        def deliver_critical_record(consumer):
            # stands in for SupervisorLogMessageConsumer.run
            critical = logging.LogRecord(
                "oq.job.123", logging.CRITICAL, "path", 42, "a msg", (), None
            )
            consumer.log_callback(critical)
            assert consumer._stopped

        run_path = ("openquake.supervising.supervisor."
                    "SupervisorLogMessageConsumer.run")
        with patch(run_path) as run:
            # the supervisor will receive a msg
            run.side_effect = deliver_critical_record

            supervisor.supervise(1, 123, timeout=0.1)

            # Every collaborator must have been called exactly once with
            # exactly these (args, kwargs); they are checked in this order.
            expectations = (
                # the job process is terminated
                (self.terminate_job, ((1,), {})),
                # stop time is recorded
                (self.record_job_stop_time, ((123,), {})),
                # the cleanup is triggered
                (self.cleanup_after_job, ((123,), {})),
                # the status in the job record is updated
                (self.update_job_status_and_error_msg,
                 ((123, "failed", "a msg"), {})),
            )
            for collaborator, expected_call in expectations:
                self.assertEqual(1, collaborator.call_count)
                self.assertEqual(expected_call, collaborator.call_args)
Example #2
0
    def test_actions_after_job_process_failures(self):
        # Scenario: the job process is running but has some failure counters
        # above zero; the supervisor is expected to terminate the job,
        # record the stop time and trigger the cleanup.
        consumer_cls = supervisor.SupervisorLogMessageConsumer
        job_id = self.job.id

        self.is_pid_running.return_value = True
        self.get_job_status.return_value = 'running'

        # start from a clean slate, then register three failures
        stats.delete_job_counters(job_id)
        stats.incr_counter(job_id, "h", "a-failures")
        stats.incr_counter(job_id, "r", "b-failures")
        stats.incr_counter(job_id, "r", "b-failures")

        # shorten the delay to checking failure counters, but only while the
        # supervisor runs: FCC_DELAY is set on SupervisorLogMessageConsumer
        # itself, so leaving it modified would leak into later tests
        saved_delay = consumer_cls.FCC_DELAY
        consumer_cls.FCC_DELAY = 2
        try:
            supervisor.supervise(1, job_id, timeout=0.1)
        finally:
            consumer_cls.FCC_DELAY = saved_delay

        # the job process is terminated
        self.assertEqual(1, self.terminate_job.call_count)
        self.assertEqual(((1,), {}), self.terminate_job.call_args)

        # stop time is recorded
        self.assertEqual(1, self.record_job_stop_time.call_count)
        self.assertEqual(
            ((job_id,), {}),
            self.record_job_stop_time.call_args)

        # the cleanup is triggered
        self.assertEqual(1, self.cleanup_after_job.call_count)
        self.assertEqual(
            ((job_id,), {}),
            self.cleanup_after_job.call_args)
Example #3
0
    def test_actions_after_job_process_crash(self):
        # Scenario: the job process is *not* running although the database
        # record still says the job is 'running'.
        self.is_pid_running.return_value = False
        self.get_job_status.return_value = 'running'

        job_id = self.job.id
        supervisor.supervise(1, job_id, timeout=0.1)

        # stop time is recorded, then the cleanup is triggered; each must
        # have been called exactly once with just the job id
        only_job_id = ((job_id,), {})
        for mocked in (self.record_job_stop_time, self.cleanup_after_job):
            self.assertEqual(1, mocked.call_count)
            self.assertEqual(only_job_id, mocked.call_args)

        # the status in the job record is updated
        status_update = self.update_job_status_and_error_msg
        self.assertEqual(1, status_update.call_count)
        expected_update = (
            (job_id,), {'error_msg': 'job process 1 crashed or terminated'})
        self.assertEqual(expected_update, status_update.call_args)
Example #4
0
    def test_actions_after_a_critical_message(self):
        # Scenario: the job process is running and the patched consumer
        # `run` feeds one CRITICAL log record for job 123 to the supervisor.
        self.is_pid_running.return_value = True

        def assert_called_once(mocked, *args):
            # exactly one call, positional `args` only, no keyword arguments
            self.assertEqual(1, mocked.call_count)
            self.assertEqual((args, {}), mocked.call_args)

        def emit_critical(consumer):
            # replacement body for SupervisorLogMessageConsumer.run
            consumer.log_callback(
                logging.LogRecord(
                    'oq.job.123', logging.CRITICAL, 'path', 42, 'a msg',
                    (), None))
            assert consumer._stopped

        target = ('openquake.supervising.'
                  'supervisor.SupervisorLogMessageConsumer.run')
        with patch(target) as run:
            # the supervisor will receive a msg
            run.side_effect = emit_critical

            supervisor.supervise(1, 123, timeout=0.1)

            # the job process is terminated
            assert_called_once(self.terminate_job, 1)

            # stop time is recorded
            assert_called_once(self.record_job_stop_time, 123)

            # the cleanup is triggered
            assert_called_once(self.cleanup_after_job, 123)

            # the status in the job record is updated
            assert_called_once(
                self.update_job_status_and_error_msg, 123, 'failed', 'a msg')
Example #5
0
def main():  # pylint: disable=C0111
    """Supervise the job identified by the command line arguments.

    ``sys.argv[1]`` is parsed as the job id and ``sys.argv[2]`` as the pid;
    both are passed on to ``supervisor.supervise(pid, job_id)``.
    """
    # The settings module is set in the environment before the supervisor
    # module is imported below.
    os.environ['DJANGO_SETTINGS_MODULE'] = 'openquake.settings'

    from openquake.supervising import supervisor

    job_id, pid = int(sys.argv[1]), int(sys.argv[2])
    supervisor.supervise(pid, job_id)
def main():  # pylint: disable=C0111
    """Run the supervisor for one job.

    Reads the job id from ``sys.argv[1]`` and the pid from ``sys.argv[2]``
    (both converted with ``int``) and calls ``supervisor.supervise``.
    """
    # set the settings module first; the import below comes after it
    os.environ['DJANGO_SETTINGS_MODULE'] = 'openquake.settings'

    from openquake.supervising import supervisor

    argv = sys.argv
    job_id = int(argv[1])
    pid = int(argv[2])
    supervisor.supervise(pid, job_id)
Example #7
0
    def test_actions_after_job_process_termination(self):
        # Scenario: the job process is *not* running and the job status is
        # reported as 'succeeded'.
        self.is_pid_running.return_value = False
        self.get_job_status.return_value = 'succeeded'

        supervisor.supervise(1, 123, timeout=0.1)

        # stop time is recorded, then the cleanup is triggered: each exactly
        # once, with the job id as the only argument
        job_id_only = ((123,), {})
        for mocked in (self.record_job_stop_time, self.cleanup_after_job):
            self.assertEqual(1, mocked.call_count)
            self.assertEqual(job_id_only, mocked.call_args)
Example #8
0
    def test_actions_after_job_process_termination(self):
        # Scenario: the job process is *not* running and its status is
        # reported as "succeeded".
        self.is_pid_running.return_value = False
        self.get_job_status.return_value = "succeeded"

        def assert_called_once_with_job_id(mocked):
            # one call only, with 123 as the sole positional argument
            self.assertEqual(1, mocked.call_count)
            self.assertEqual(((123,), {}), mocked.call_args)

        supervisor.supervise(1, 123, timeout=0.1)

        # stop time is recorded
        assert_called_once_with_job_id(self.record_job_stop_time)

        # the cleanup is triggered
        assert_called_once_with_job_id(self.cleanup_after_job)
Example #9
0
    def test_actions_after_job_process_termination(self):
        # Scenario: the job process is *not* running and the job status is
        # reported as 'succeeded'.
        self.is_pid_running.return_value = False
        self.get_job_status.return_value = 'succeeded'

        supervisor.supervise(1, 123)

        # the cleanup is triggered
        cleanup = self.cleanup_after_job
        self.assertEqual(1, cleanup.call_count)
        self.assertEqual(((123,), {}), cleanup.call_args)

        # the outcome is signalled
        outcome = self.signal_job_outcome
        self.assertEqual(1, outcome.call_count)
        self.assertEqual(((123, 'succeeded'), {}), outcome.call_args)
Example #10
0
    def test_actions_after_job_process_crash(self):
        # Scenario: the job process is *not* running, but the database
        # record says it is.
        self.is_pid_running.return_value = False
        self.get_job_status.return_value = 'running'

        supervisor.supervise(1, 123)

        # Each collaborator must have been called exactly once with exactly
        # these (args, kwargs); they are checked in the listed order.
        expectations = (
            # the cleanup is triggered
            (self.cleanup_after_job, ((123,), {})),
            # the outcome is signalled
            (self.signal_job_outcome, ((123, 'failed'), {})),
            # the status in the job record is updated
            (self.update_job_status_and_error_msg,
             ((123, 'failed', 'crash'), {})),
        )
        for collaborator, expected_call in expectations:
            self.assertEqual(1, collaborator.call_count)
            self.assertEqual(expected_call, collaborator.call_args)
Example #11
0
    def test_actions_after_job_process_crash(self):
        # Scenario: the job process is *not* running, but the database
        # record says it is.
        self.is_pid_running.return_value = False
        self.get_job_status.return_value = 'running'

        def assert_called_once(mocked, *args):
            # exactly one call, positional `args` only, no keyword arguments
            self.assertEqual(1, mocked.call_count)
            self.assertEqual((args, {}), mocked.call_args)

        supervisor.supervise(1, 123, timeout=0.1)

        # stop time is recorded
        assert_called_once(self.record_job_stop_time, 123)

        # the cleanup is triggered
        assert_called_once(self.cleanup_after_job, 123)

        # the status in the job record is updated
        assert_called_once(
            self.update_job_status_and_error_msg,
            123, 'failed', 'job process 1 crashed or terminated')
Example #12
0
    def test_actions_after_a_critical_message(self):
        # Scenario: the job process is running and the patched consumer
        # `run` keeps delivering an AMQP message with body 'a msg' until the
        # message callback raises StopIteration.
        self.is_pid_running.return_value = True

        def deliver_until_stopped(consumer):
            # replacement body for SupervisorLogMessageConsumer.run
            try:
                while True:
                    consumer.message_callback(amqp.Message(body='a msg'))
            except StopIteration:
                # the callback signalled that consuming is over
                return

        target = ('openquake.supervising.'
                  'supervisor.SupervisorLogMessageConsumer.run')
        with patch(target) as run:
            # the supervisor will receive a msg
            run.side_effect = deliver_until_stopped

            supervisor.supervise(1, 123)

            # Each collaborator must have been called exactly once with
            # exactly these (args, kwargs); checked in the listed order.
            expectations = (
                # the job process is terminated
                (self.terminate_job, ((1,), {})),
                # the cleanup is triggered
                (self.cleanup_after_job, ((123,), {})),
                # the outcome is signalled
                (self.signal_job_outcome, ((123, 'failed'), {})),
                # the status in the job record is updated
                (self.update_job_status_and_error_msg,
                 ((123, 'failed', 'a msg'), {})),
            )
            for collaborator, expected_call in expectations:
                self.assertEqual(1, collaborator.call_count)
                self.assertEqual(expected_call, collaborator.call_args)
Example #13
0
    def test_actions_after_job_process_failures(self):
        # Scenario: the job process is running but has some failure counters
        # above zero; the supervisor is expected to terminate the job,
        # record the stop time and trigger the cleanup.
        consumer_cls = supervisor.SupervisorLogMessageConsumer

        self.is_pid_running.return_value = True
        self.get_job_status.return_value = 'running'

        # start from a clean slate, then register three failures for job 123
        stats.delete_job_counters(123)
        stats.incr_counter(123, "h", "a:failed")
        stats.incr_counter(123, "r", "b:failed")
        stats.incr_counter(123, "r", "b:failed")

        # shorten the delay to checking failure counters, but only while the
        # supervisor runs: FCC_DELAY is set on SupervisorLogMessageConsumer
        # itself, so leaving it modified would leak into later tests
        saved_delay = consumer_cls.FCC_DELAY
        consumer_cls.FCC_DELAY = 2
        try:
            supervisor.supervise(1, 123, timeout=0.1)
        finally:
            consumer_cls.FCC_DELAY = saved_delay

        # the job process is terminated
        self.assertEqual(1, self.terminate_job.call_count)
        self.assertEqual(((1,), {}), self.terminate_job.call_args)

        # stop time is recorded
        self.assertEqual(1, self.record_job_stop_time.call_count)
        self.assertEqual(((123,), {}), self.record_job_stop_time.call_args)

        # the cleanup is triggered
        self.assertEqual(1, self.cleanup_after_job.call_count)
        self.assertEqual(((123,), {}), self.cleanup_after_job.call_args)
Example #14
0
            _do_run_hazard(job, exports)
        except Exception, ex:
            logs.LOG.critical("Calculation failed with exception: '%s'"
                              % str(ex))
            raise
        finally:
            job.is_running = False
            job.save()
        return

    supervisor_pid = os.fork()
    if not supervisor_pid:
        # supervisor process
        logs.set_logger_level(logs.logging.root, log_level)
        # TODO: deal with KVS garbage collection
        supervisor.supervise(job_pid, job.id, log_file=log_file)
        return

    # parent process

    # ignore Ctrl-C as well as supervisor process does. thus only
    # job executor terminates on SIGINT
    supervisor.ignore_sigint()
    # wait till both child processes are done
    os.waitpid(job_pid, 0)
    os.waitpid(supervisor_pid, 0)

    # Refresh the job record, since the forked processes are going to modify
    # job state.
    return models.OqJob.objects.get(id=job.id)
Example #15
0
            job.save()
            raise
        else:
            job.status = 'succeeded'
            job.save()
        return

    supervisor_pid = os.fork()
    if not supervisor_pid:
        # supervisor process
        logs.set_logger_level(logs.logging.root, log_level)
        supervisor_pid = os.getpid()
        job.supervisor_pid = supervisor_pid
        job.job_pid = job_pid
        job.save()
        supervisor.supervise(job_pid, job.id, log_file=log_file)
        return

    # parent process

    # ignore Ctrl-C as well as supervisor process does. thus only
    # job executor terminates on SIGINT
    supervisor.ignore_sigint()
    # wait till both child processes are done
    os.waitpid(job_pid, 0)
    os.waitpid(supervisor_pid, 0)

    return job


def _launch_job(job_ctxt, sections):
Example #16
0
            LOG.critical("Job failed with exception: '%s'" % str(ex))
            a_job.set_status('failed')
            raise
        else:
            a_job.set_status('succeeded')
        return

    supervisor_pid = os.fork()
    if not supervisor_pid:
        # supervisor process
        supervisor_pid = os.getpid()
        job = OqJob.objects.get(id=a_job.job_id)
        job.supervisor_pid = supervisor_pid
        job.job_pid = job_pid
        job.save()
        supervisor.supervise(job_pid, a_job.job_id)
        return

    # parent process

    # ignore Ctrl-C as well as supervisor process does. thus only
    # job executor terminates on SIGINT
    supervisor.ignore_sigint()
    # wait till both child processes are done
    os.waitpid(job_pid, 0)
    os.waitpid(supervisor_pid, 0)


def parse_config_file(config_file):
    """
    We have a single configuration file which may contain a risk section and
Example #17
0
            job.save()
            raise
        else:
            job.status = 'succeeded'
            job.save()
        return

    supervisor_pid = os.fork()
    if not supervisor_pid:
        # supervisor process
        logs.set_logger_level(logs.logging.root, log_level)
        supervisor_pid = os.getpid()
        job.supervisor_pid = supervisor_pid
        job.job_pid = job_pid
        job.save()
        supervisor.supervise(job_pid, job.id)
        return

    # parent process

    # ignore Ctrl-C as well as supervisor process does. thus only
    # job executor terminates on SIGINT
    supervisor.ignore_sigint()
    # wait till both child processes are done
    os.waitpid(job_pid, 0)
    os.waitpid(supervisor_pid, 0)

    return job


def _launch_job(job_ctxt, sections):
Example #18
0
            calculation.status = 'failed'
            calculation.save()
            raise
        else:
            calculation.status = 'succeeded'
            calculation.save()
        return

    supervisor_pid = os.fork()
    if not supervisor_pid:
        # supervisor process
        supervisor_pid = os.getpid()
        calculation.supervisor_pid = supervisor_pid
        calculation.job_pid = calc_pid
        calculation.save()
        supervisor.supervise(calc_pid, calculation.id)
        return

    # parent process

    # ignore Ctrl-C as well as supervisor process does. thus only
    # job executor terminates on SIGINT
    supervisor.ignore_sigint()
    # wait till both child processes are done
    os.waitpid(calc_pid, 0)
    os.waitpid(supervisor_pid, 0)

    return calculation


def _launch_calculation(calc_proxy, sections):