def __init__(self, instance):
    """Eagerly compute and cache the status of each known daemon type.

    `instance` is the DagsterInstance whose daemon heartbeats are inspected.
    Statuses are keyed by the Dauphin (GraphQL) daemon-type value.
    """
    from dagster.daemon.controller import get_daemon_status

    self._daemon_statuses = {
        DauphinDaemonType.SCHEDULER.value: get_daemon_status(  # pylint: disable=no-member
            instance, DaemonType.SCHEDULER
        ),
        DauphinDaemonType.SENSOR.value: get_daemon_status(  # pylint: disable=no-member
            instance, DaemonType.SENSOR
        ),
        DauphinDaemonType.QUEUED_RUN_COORDINATOR.value: get_daemon_status(  # pylint: disable=no-member
            instance, DaemonType.QUEUED_RUN_COORDINATOR
        ),
    }
def __init__(self, instance):
    """Eagerly compute and cache the status of each known daemon type.

    `instance` is the DagsterInstance whose daemon heartbeats are inspected.
    Statuses are keyed by the DaemonType enum value.
    """
    super().__init__()
    self._daemon_statuses = {
        DaemonType.SCHEDULER.value: get_daemon_status(  # pylint: disable=no-member
            instance, DaemonType.SCHEDULER
        ),
        DaemonType.SENSOR.value: get_daemon_status(  # pylint: disable=no-member
            instance, DaemonType.SENSOR
        ),
        DaemonType.QUEUED_RUN_COORDINATOR.value: get_daemon_status(  # pylint: disable=no-member
            instance, DaemonType.QUEUED_RUN_COORDINATOR
        ),
    }
def resolve_allDaemonStatuses(self, _graphene_info):
    """GraphQL resolver: return a status object for every daemon type the
    instance requires. Errors are ignored so an unhealthy daemon still
    yields a status rather than failing the query.
    """
    return [
        GrapheneDaemonStatus(
            get_daemon_status(self._instance, daemon_type, ignore_errors=True)
        )
        for daemon_type in self._instance.get_required_daemon_types()
    ]
def test_multiple_error_daemon(monkeypatch):
    """A daemon core loop that yields multiple errors should surface all of
    them (most recent first) on its heartbeat while staying live.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_loop_error(_, _instance, _workspace):
            # ?message stack cls_name cause"
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)
            while True:
                yield time.sleep(0.5)

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        with daemon_controller_from_instance(
            instance, heartbeat_interval_seconds=heartbeat_interval_seconds
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                ):
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance, SensorDaemon.daemon_type(), now.float_timestamp
                    )

                    # Errors are reported most-recent-first
                    if status.healthy == False and len(status.last_heartbeat.errors) == 2:
                        assert status.last_heartbeat.errors[0].message.strip() == "bizbuz"
                        assert status.last_heartbeat.errors[1].message.strip() == "foobar"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_error_daemon(monkeypatch):
    """A daemon iteration that raises should mark the daemon unhealthy and
    record the error message on its last heartbeat.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_):
            raise DagsterInvariantViolationError("foobar")

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)
        controller = DagsterDaemonController(instance)

        init_time = pendulum.now("UTC")
        controller.run_iteration(init_time)

        status = get_daemon_status(instance, SensorDaemon.daemon_type(), init_time.float_timestamp)
        assert status.healthy == False
        assert (
            status.last_heartbeat.error.message.strip()
            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
        )
def test_thread_die_daemon(monkeypatch):
    """If a daemon thread dies (KeyboardInterrupt in its core loop), the
    controller's thread check should eventually raise naming the dead thread.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SchedulerDaemon, SensorDaemon

        iteration_ran = {"ran": False}

        def run_loop_error(_, _instance, _workspace):
            iteration_ran["ran"] = True
            raise KeyboardInterrupt
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        heartbeat_interval_seconds = 1

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SchedulerDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                if iteration_ran["ran"] and status.healthy:
                    try:
                        controller.check_daemon_threads()  # Should eventually throw since the sensor thread is interrupted
                    except Exception as e:
                        assert (
                            "Stopping dagster-daemon process since the following threads are no longer running: ['SENSOR']"
                            in str(e)
                        )
                        break

                if (now - init_time).total_seconds() > 20:
                    raise Exception("timed out waiting for check_daemons to fail")

                time.sleep(0.5)
def test_multiple_error_daemon(monkeypatch):
    """A daemon iteration that yields multiple errors should surface both of
    them, in yield order, on the daemon's last heartbeat.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_):
            # ?message stack cls_name cause"
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)
        controller = DagsterDaemonController(instance)

        init_time = pendulum.now("UTC")
        controller.run_iteration(init_time)

        status = get_daemon_status(instance, SensorDaemon.daemon_type(), init_time.float_timestamp)
        assert status.healthy == False
        assert len(status.last_heartbeat.errors) == 2
        assert status.last_heartbeat.errors[0].message.strip() == "foobar"
        assert status.last_heartbeat.errors[1].message.strip() == "bizbuz"
def test_error_daemon(monkeypatch):
    """A raising daemon iteration should leave the daemon live (heartbeating)
    but unhealthy, with the error recorded on the heartbeat.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event, _grpc_server_registry):
            raise DagsterInvariantViolationError("foobar")
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    status = get_daemon_status(
                        instance, SensorDaemon.daemon_type(), now.float_timestamp
                    )
                    assert status.healthy == False
                    assert len(status.last_heartbeat.errors) == 1
                    assert (
                        status.last_heartbeat.errors[0].message.strip()
                        == "dagster.core.errors.DagsterInvariantViolationError: foobar"
                    )
                    # Unhealthy (error present) but still live (heartbeating)
                    assert not all_daemons_healthy(instance, curr_time_seconds=now.float_timestamp)
                    assert all_daemons_live(instance, curr_time_seconds=now.float_timestamp)
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_multiple_error_daemon(monkeypatch):
    """A daemon iteration that yields two errors should surface both on the
    heartbeat, in yield order, while the daemon keeps running.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event_, _grpc_server_registry):
            # ?message stack cls_name cause"
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    status = get_daemon_status(
                        instance, SensorDaemon.daemon_type(), now.float_timestamp
                    )

                    if status.healthy == False:
                        assert len(status.last_heartbeat.errors) == 2
                        assert status.last_heartbeat.errors[0].message.strip() == "foobar"
                        assert status.last_heartbeat.errors[1].message.strip() == "bizbuz"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_thread_die_daemon(monkeypatch):
    """If a daemon thread dies, check_daemons should eventually raise naming
    the thread that stopped sending heartbeats.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SchedulerDaemon, SensorDaemon

        iteration_ran = {"ran": False}

        def run_iteration_error(_, _instance, _daemon_shutdown_event, _grpc_server_registry):
            iteration_ran["ran"] = True
            raise KeyboardInterrupt
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance, SchedulerDaemon.daemon_type(), now.float_timestamp
                )

                if iteration_ran["ran"] and status.healthy:
                    try:
                        controller.check_daemons()  # Should eventually throw since the sensor thread is interrupted
                    except Exception as e:  # pylint: disable=broad-except
                        assert (
                            "Stopping dagster-daemon process since the following threads are no longer sending heartbeats: ['SENSOR']"
                            in str(e)
                        )
                        break

                if (now - init_time).total_seconds() > 20:
                    raise Exception("timed out waiting for check_daemons to fail")

                time.sleep(0.5)
def test_error_daemon(monkeypatch):
    """A raising daemon iteration marks the daemon unhealthy (error recorded)
    while the daemon itself remains live.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_):
            raise DagsterInvariantViolationError("foobar")
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)
        controller = DagsterDaemonController(instance)

        init_time = pendulum.now("UTC")
        controller.run_iteration(init_time)

        status = get_daemon_status(instance, SensorDaemon.daemon_type(), init_time.float_timestamp)
        assert status.healthy == False
        assert len(status.last_heartbeat.errors) == 1
        assert (
            status.last_heartbeat.errors[0].message.strip()
            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
        )
        # Unhealthy (error present) but still live (heartbeating)
        assert not all_daemons_healthy(instance, curr_time_seconds=init_time.float_timestamp)
        assert all_daemons_live(instance, curr_time_seconds=init_time.float_timestamp)
def debug_heartbeat_dump_command():
    """CLI debug command: print the current status of every daemon type the
    instance requires, one per line.
    """
    with DagsterInstance.get() as instance:
        for daemon_type in instance.get_required_daemon_types():
            click.echo(get_daemon_status(instance, daemon_type))
def test_warn_multiple_daemons(capsys):
    """The "Taking over from another SENSOR daemon process" warning should be
    emitted only when two daemon controllers heartbeat concurrently — not on
    normal startup, and not when a controller starts after a previous one
    has exited.
    """
    from dagster.daemon.daemon import SensorDaemon

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        with daemon_controller_from_instance(
            instance, heartbeat_interval_seconds=heartbeat_interval_seconds
        ):
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                ):
                    # First controller: no takeover warning expected
                    captured = capsys.readouterr()
                    assert "Taking over from another SENSOR daemon process" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for daemon status")

                time.sleep(0.5)

            capsys.readouterr()

        init_time = pendulum.now("UTC")

        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            now.float_timestamp,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        )
        last_heartbeat_time = status.last_heartbeat.timestamp

        # No warning when a second controller starts up again
        with daemon_controller_from_instance(
            instance, heartbeat_interval_seconds=heartbeat_interval_seconds
        ):
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                if status.last_heartbeat and status.last_heartbeat.timestamp != last_heartbeat_time:
                    captured = capsys.readouterr()
                    assert "Taking over from another SENSOR daemon process" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for new daemon status")

                time.sleep(0.5)

            status = get_daemon_status(
                instance,
                SensorDaemon.daemon_type(),
                now.float_timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )
            last_heartbeat_time = status.last_heartbeat.timestamp

            # Starting up a controller while one is running produces the warning though
            with daemon_controller_from_instance(
                instance, heartbeat_interval_seconds=heartbeat_interval_seconds
            ):
                # Wait for heartbeats while two controllers are running at once and there will
                # be a warning
                init_time = pendulum.now("UTC")

                while True:
                    now = pendulum.now("UTC")

                    captured = capsys.readouterr()
                    if "Taking over from another SENSOR daemon process" in captured.out:
                        break

                    if (now - init_time).total_seconds() > 60:
                        raise Exception("timed out waiting for heartbeats")

                    time.sleep(5)
def test_error_daemon(monkeypatch):
    """Errors from a repeatedly-failing daemon iteration accumulate on the
    heartbeat up to a cap of 5, and drain back to 0 after the daemon stops
    raising (once error_interval_seconds elapses).

    Also fixes a typo in the final timeout message ("hearrteat" -> "heartbeat").
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        should_raise_errors = True

        def run_iteration_error(_, _instance, _workspace):
            if should_raise_errors:
                raise DagsterInvariantViolationError("foobar")
            yield

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        heartbeat_interval_seconds = 1

        gen_daemons = lambda instance: [SensorDaemon(interval_seconds=1)]

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=gen_daemons,
            error_interval_seconds=10,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                    ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )
                    assert status.healthy == False

                    # Errors build up until they hit 5
                    if len(status.last_heartbeat.errors) == 5:
                        assert (
                            status.last_heartbeat.errors[0].message.strip()
                            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
                        )
                        assert not get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            curr_time_seconds=now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                        ).healthy
                        assert get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            curr_time_seconds=now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                            ignore_errors=True,
                        ).healthy

                        time.sleep(3)

                        status = get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                        )

                        # Error count does not rise above 5
                        assert len(status.last_heartbeat.errors) == 5

                        break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False

            init_time = pendulum.now("UTC")
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat errors to return to 0")

                time.sleep(0.5)
def resolve_daemonStatus(self, _graphene_info, daemon_type):
    """GraphQL resolver: return the status for a single daemon type.

    Raises a check error if `daemon_type` is not a string.
    """
    check.str_param(daemon_type, "daemon_type")
    return GrapheneDaemonStatus(get_daemon_status(self._instance, daemon_type))
def test_error_daemon(monkeypatch):
    """Errors from a repeatedly-raising daemon core loop accumulate on the
    heartbeat (capped at 5, most recent first, numbered so ordering can be
    verified) and drain to 0 after the daemon stops raising.

    Also fixes a typo in the final timeout message ("hearrteat" -> "heartbeat").
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        should_raise_errors = True

        error_count = {"count": 0}

        def run_loop_error(_, _instance, _workspace):
            if should_raise_errors:
                time.sleep(0.5)
                error_count["count"] = error_count["count"] + 1
                raise DagsterInvariantViolationError("foobar:" + str(error_count["count"]))

            while True:
                yield time.sleep(0.5)

        def _get_error_number(error):
            # Errors are raised as "foobar:<n>" — recover n for ordering checks
            error_message = error.message.strip()
            return int(error_message.split("foobar:")[1])

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        heartbeat_interval_seconds = 1

        gen_daemons = lambda instance: [SensorDaemon()]

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=gen_daemons,
            error_interval_seconds=10,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                    ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )
                    assert status.healthy == False

                    # Errors build up until there are > 5, then pull off the last
                    if len(status.last_heartbeat.errors) >= 5:
                        first_error_number = _get_error_number(status.last_heartbeat.errors[0])

                        if first_error_number > 5:
                            # Verify error numbers decrease consecutively
                            assert [
                                _get_error_number(error) for error in status.last_heartbeat.errors
                            ] == list(range(first_error_number, first_error_number - 5, -1))

                            assert not get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            ).healthy
                            assert get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                                ignore_errors=True,
                            ).healthy

                            time.sleep(3)

                            status = get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            )

                            # Error count does not rise above 5, continues to increase
                            assert len(status.last_heartbeat.errors) == 5

                            new_first_error_number = _get_error_number(
                                status.last_heartbeat.errors[0]
                            )
                            assert new_first_error_number > first_error_number

                            break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False

            init_time = pendulum.now("UTC")
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat errors to return to 0")

                time.sleep(0.5)
def resolve_allDaemonStatuses(self, _graphene_info):
    """GraphQL resolver: return a status object for every daemon the
    instance requires.
    """
    return [
        GrapheneDaemonStatus(get_daemon_status(self._instance, daemon_type))
        for daemon_type in required_daemons(self._instance)
    ]