def test_get_required_daemon_types():
    """Check get_required_daemon_types() with and without run monitoring.

    A test instance created without overrides must report the sensor,
    backfill and scheduler daemon types (in that order). With a custom run
    launcher and run monitoring enabled, the monitoring daemon type is
    reported as well, after the other three.
    """
    from dagster.daemon.daemon import (
        BackfillDaemon,
        MonitoringDaemon,
        SchedulerDaemon,
        SensorDaemon,
    )

    with instance_for_test() as instance:
        expected_default = [
            SensorDaemon.daemon_type(),
            BackfillDaemon.daemon_type(),
            SchedulerDaemon.daemon_type(),
        ]
        assert instance.get_required_daemon_types() == expected_default

    # Overrides that switch run monitoring on, using the test run launcher.
    monitoring_overrides = {
        "run_launcher": {
            "module": "dagster_tests.daemon_tests.test_monitoring_daemon",
            "class": "TestRunLauncher",
        },
        "run_monitoring": {"enabled": True},
    }

    with instance_for_test(overrides=monitoring_overrides) as instance:
        expected_with_monitoring = [
            SensorDaemon.daemon_type(),
            BackfillDaemon.daemon_type(),
            SchedulerDaemon.daemon_type(),
            MonitoringDaemon.daemon_type(),
        ]
        assert instance.get_required_daemon_types() == expected_with_monitoring
def test_add_heartbeat(self, storage):
    """Check that a daemon heartbeat can be inserted and then overwritten.

    Two heartbeats of the sensor daemon type are written in turn. After each
    write the storage must hold exactly one heartbeat, and the heartbeat
    stored under the sensor daemon type must equal the one just written.
    """
    self._skip_in_memory(storage)

    # First pass exercises the insert path, second pass the update path.
    for epoch_seconds in (1000, 2000):
        written_heartbeat = DaemonHeartbeat(
            timestamp=pendulum.from_timestamp(epoch_seconds).float_timestamp,
            daemon_type=SensorDaemon.daemon_type(),
            daemon_id=None,
            errors=[],
        )
        storage.add_daemon_heartbeat(written_heartbeat)

        assert len(storage.get_daemon_heartbeats()) == 1
        assert (
            storage.get_daemon_heartbeats()[SensorDaemon.daemon_type()]
            == written_heartbeat
        )
def debug_daemon_heartbeats(instance):
    """Write a sensor daemon heartbeat to ``instance`` and print it back.

    The current UTC time is stored as the heartbeat timestamp, the heartbeat
    is read back by daemon type, and both the written and the read timestamps
    are printed so they can be compared by eye.
    """
    sensor_daemon = SensorDaemon()
    written_timestamp = pendulum.now("UTC").float_timestamp

    heartbeat = DaemonHeartbeat(
        written_timestamp, sensor_daemon.daemon_type(), None, None
    )
    instance.add_daemon_heartbeat(heartbeat)

    stored_heartbeats = instance.get_daemon_heartbeats()
    read_timestamp = stored_heartbeats[sensor_daemon.daemon_type()].timestamp

    print(  # pylint: disable=print-call
        f"Written timestamp: {written_timestamp}\nRead timestamp: {read_timestamp}"
    )
def debug_daemon_heartbeats(instance):
    """Write a sensor daemon heartbeat to ``instance`` and print it back.

    The current UTC time is stored as the heartbeat timestamp, the heartbeat
    is read back by daemon type, and both the written and the read timestamps
    are printed so they can be compared by eye.
    """
    daemon = SensorDaemon(instance, interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS)
    timestamp = pendulum.now("UTC").float_timestamp

    instance.add_daemon_heartbeat(
        DaemonHeartbeat(timestamp, daemon.daemon_type(), None, None)
    )
    returned_timestamp = instance.get_daemon_heartbeats()[daemon.daemon_type()].timestamp

    # The label previously read "Written timetstamp"; spelling corrected.
    print(  # pylint: disable=print-call
        f"Written timestamp: {timestamp}\nRead timestamp: {returned_timestamp}"
    )
def create_daemon_of_type(daemon_type):
    """Create the daemon whose ``daemon_type()`` equals ``daemon_type``.

    The scheduler, sensor and queued-run-coordinator daemons are tried in
    that order; the matching class is built via ``create_from_instance`` with
    a freshly obtained ``DagsterInstance.get()``.

    Raises:
        Exception: if ``daemon_type`` matches none of the known daemons.
    """
    known_daemon_classes = (
        SchedulerDaemon,
        SensorDaemon,
        QueuedRunCoordinatorDaemon,
    )
    for daemon_cls in known_daemon_classes:
        if daemon_type == daemon_cls.daemon_type():
            return daemon_cls.create_from_instance(DagsterInstance.get())

    raise Exception(f"Unexpected daemon type {daemon_type}")
def test_multiple_error_daemon(monkeypatch):
    """Check that two errors yielded by the sensor core loop reach the heartbeat.

    ``SensorDaemon.core_loop`` is patched with a generator that yields two
    errors and then idles. The test polls until the sensor daemon status is
    unhealthy with exactly two heartbeat errors, expecting "bizbuz" first and
    "foobar" second, and fails after 10 seconds without such a status.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_loop_error(_, _instance, _workspace):
            # SerializableErrorInfo positional fields: message, stack, cls_name, cause
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

            # Keep the loop alive after the errors were reported.
            while True:
                yield
                time.sleep(0.5)

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        started_at = pendulum.now("UTC")
        heartbeat_interval_seconds = 1

        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:
                current_time = pendulum.now("UTC")

                daemons_live = all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                )
                if daemons_live:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    statuses = get_daemon_statuses(
                        instance,
                        [SensorDaemon.daemon_type()],
                        current_time.float_timestamp,
                    )
                    sensor_status = statuses[SensorDaemon.daemon_type()]

                    if (
                        sensor_status.healthy == False
                        and len(sensor_status.last_heartbeat.errors) == 2
                    ):
                        reported = sensor_status.last_heartbeat.errors
                        assert reported[0].message.strip() == "bizbuz"
                        assert reported[1].message.strip() == "foobar"
                        break

                elapsed_seconds = (current_time - started_at).total_seconds()
                if elapsed_seconds > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def create_daemon_of_type(daemon_type, instance):
    """Construct the daemon whose ``daemon_type()`` equals ``daemon_type``.

    Scheduler and backfill daemons use ``DEFAULT_DAEMON_INTERVAL_SECONDS``,
    the sensor daemon uses ``DEFAULT_SENSOR_DAEMON_INTERVAL``, and the queued
    run coordinator daemon takes its interval from
    ``instance.run_coordinator.dequeue_interval_seconds``.

    Raises:
        Exception: if ``daemon_type`` matches none of the known daemons.
    """
    if daemon_type == SchedulerDaemon.daemon_type():
        return SchedulerDaemon(interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS)

    if daemon_type == SensorDaemon.daemon_type():
        return SensorDaemon(interval_seconds=DEFAULT_SENSOR_DAEMON_INTERVAL)

    if daemon_type == QueuedRunCoordinatorDaemon.daemon_type():
        dequeue_interval = instance.run_coordinator.dequeue_interval_seconds
        return QueuedRunCoordinatorDaemon(interval_seconds=dequeue_interval)

    if daemon_type == BackfillDaemon.daemon_type():
        return BackfillDaemon(interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS)

    raise Exception(f"Unexpected daemon type {daemon_type}")
def __init__(self, instance):
    """Register the daemons that ``instance`` is configured to run.

    A sensor daemon is always added. A scheduler daemon is added when the
    instance's scheduler is a ``DagsterDaemonScheduler``, and a queued run
    coordinator daemon when its run coordinator is a ``QueuedRunCoordinator``.
    The resulting daemon class names are logged.

    Raises:
        Exception: if no daemon ended up registered.
    """
    self._instance = instance
    self._daemons = {}
    self._logger = get_default_daemon_logger("dagster-daemon")

    if isinstance(instance.scheduler, DagsterDaemonScheduler):
        catchup_limit = instance.scheduler.max_catchup_runs
        scheduler_daemon = SchedulerDaemon(
            instance, interval_seconds=30, max_catchup_runs=catchup_limit
        )
        self._add_daemon(scheduler_daemon)

    sensor_daemon = SensorDaemon(instance, interval_seconds=30)
    self._add_daemon(sensor_daemon)

    if isinstance(instance.run_coordinator, QueuedRunCoordinator):
        concurrency_limit = instance.run_coordinator.max_concurrent_runs
        dequeue_interval = instance.run_coordinator.dequeue_interval_seconds
        coordinator_daemon = QueuedRunCoordinatorDaemon(
            instance,
            interval_seconds=dequeue_interval,
            max_concurrent_runs=concurrency_limit,
        )
        self._add_daemon(coordinator_daemon)

    if not self._daemons:
        raise Exception("No daemons configured on the DagsterInstance")

    daemon_names = _sorted_quoted(type(daemon).__name__ for daemon in self.daemons)
    self._logger.info(
        "instance is configured with the following daemons: {}".format(daemon_names)
    )
def required_daemons(instance):
    """Return the list of daemon types that ``instance`` requires.

    The sensor daemon type is always included. The scheduler daemon type is
    added when the instance's scheduler is a ``DagsterDaemonScheduler``, and
    the queued run coordinator daemon type when its run coordinator is a
    ``QueuedRunCoordinator``.
    """
    required_types = [SensorDaemon.daemon_type()]

    needs_scheduler = isinstance(instance.scheduler, DagsterDaemonScheduler)
    if needs_scheduler:
        required_types.append(SchedulerDaemon.daemon_type())

    needs_run_queue = isinstance(instance.run_coordinator, QueuedRunCoordinator)
    if needs_run_queue:
        required_types.append(QueuedRunCoordinatorDaemon.daemon_type())

    return required_types
def test_wipe_heartbeats(self, storage):
    """Store one sensor daemon heartbeat, then wipe all daemon heartbeats.

    The test passes as long as neither storage call raises.
    """
    self._skip_in_memory(storage)

    heartbeat_to_wipe = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(1000).float_timestamp,
        daemon_type=SensorDaemon.daemon_type(),
        daemon_id=None,
        errors=[],
    )

    storage.add_daemon_heartbeat(heartbeat_to_wipe)
    storage.wipe_daemon_heartbeats()
def create_daemons_from_instance(instance):
    """Create one daemon object per daemon type required by ``instance``.

    The required types come from ``required_daemons(instance)``. Each type is
    matched against the scheduler, sensor and queued-run-coordinator daemon
    classes, in that order.

    Raises:
        Exception: if a required daemon type matches none of those classes.
    """
    known_daemon_classes = (
        SchedulerDaemon,
        SensorDaemon,
        QueuedRunCoordinatorDaemon,
    )

    created = []
    for daemon_type in required_daemons(instance):
        for daemon_cls in known_daemon_classes:
            if daemon_type == daemon_cls.daemon_type():
                # Separate instance for each daemon since each is in its own thread
                created.append(daemon_cls.create_from_instance(DagsterInstance.get()))
                break
        else:
            # No known class claimed this type.
            raise Exception(f"Unexpected daemon type {daemon_type}")

    return created
def test_wipe_heartbeats(self, storage):
    """Check that wiping daemon heartbeats removes a stored heartbeat.

    ``_skip_in_memory`` is applied first (presumably skipping in-memory
    storages), and the test is skipped when ``can_delete_runs()`` is falsy.
    One sensor daemon heartbeat is stored, all heartbeats are wiped, and the
    storage is then expected to report no heartbeats.
    """
    self._skip_in_memory(storage)

    if not self.can_delete_runs():
        pytest.skip("storage cannot delete")

    added_heartbeat = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(1000).float_timestamp,
        daemon_type=SensorDaemon.daemon_type(),
        daemon_id=None,
        errors=[],
    )
    storage.add_daemon_heartbeat(added_heartbeat)
    storage.wipe_daemon_heartbeats()

    # Without this check the test would pass even if the wipe did nothing.
    assert len(storage.get_daemon_heartbeats()) == 0
def __init__(self, instance):
    """Register the daemons that ``instance`` is configured to run.

    Initializes per-controller bookkeeping (a random UUID, heartbeat and
    iteration timestamps, iteration exceptions), then adds a sensor daemon
    unconditionally, a scheduler daemon when the instance's scheduler is a
    ``DagsterDaemonScheduler``, and a queued run coordinator daemon when its
    run coordinator is a ``QueuedRunCoordinator``. The registered daemon keys
    are asserted to equal ``required_daemons(instance)``.

    Raises:
        Exception: if no daemon ended up registered.
    """
    self._instance = instance
    self._daemon_uuid = str(uuid.uuid4())

    self._daemons = {}
    self._last_heartbeat_times = {}
    self._last_iteration_times = {}
    self._last_iteration_exceptions = {}
    self._current_iteration_exceptions = {}

    self._logger = get_default_daemon_logger("dagster-daemon")

    if isinstance(instance.scheduler, DagsterDaemonScheduler):
        catchup_limit = instance.scheduler.max_catchup_runs
        scheduler_daemon = SchedulerDaemon(
            instance,
            interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS,
            max_catchup_runs=catchup_limit,
        )
        self._add_daemon(scheduler_daemon)

    sensor_daemon = SensorDaemon(instance, interval_seconds=SENSOR_DAEMON_INTERVAL)
    self._add_daemon(sensor_daemon)

    if isinstance(instance.run_coordinator, QueuedRunCoordinator):
        concurrency_limit = instance.run_coordinator.max_concurrent_runs
        tag_limits = instance.run_coordinator.tag_concurrency_limits
        coordinator_daemon = QueuedRunCoordinatorDaemon(
            instance,
            interval_seconds=instance.run_coordinator.dequeue_interval_seconds,
            max_concurrent_runs=concurrency_limit,
            tag_concurrency_limits=tag_limits,
        )
        self._add_daemon(coordinator_daemon)

    # The daemons registered here must agree with what required_daemons reports.
    assert set(required_daemons(instance)) == self._daemons.keys()

    if not self._daemons:
        raise Exception("No daemons configured on the DagsterInstance")

    daemon_names = _sorted_quoted(type(daemon).__name__ for daemon in self.daemons)
    self._logger.info(
        "instance is configured with the following daemons: {}".format(daemon_names)
    )
def test_error_sensor_daemon(external_repo_context, monkeypatch):
    """Check that sensor daemon errors are recorded on its heartbeat.

    ``time.sleep`` is replaced by a fake that advances pendulum's test clock,
    the instance is configured with ``ExplodingRunLauncher``, and a running
    sensor state with an unloadable origin is added. The sensor daemon loop
    runs until 65 seconds past the frozen time; the "SENSOR" heartbeat must
    then carry exactly ``DAEMON_HEARTBEAT_ERROR_LIMIT`` errors.
    """
    utc_start = create_pendulum_time(year=2019, month=2, day=28, tz="UTC")
    freeze_datetime = to_timezone(utc_start, "US/Central")

    recorded_sleeps = []

    def fake_sleep(s):
        # Record the request and move the fake clock instead of blocking.
        recorded_sleeps.append(s)
        pendulum.set_test_now(pendulum.now().add(seconds=s))

    monkeypatch.setattr(time, "sleep", fake_sleep)

    launcher_overrides = {
        "run_launcher": {
            "module": "dagster.core.test_utils",
            "class": "ExplodingRunLauncher",
        },
    }

    with instance_with_sensors(
        external_repo_context, overrides=launcher_overrides
    ) as (instance, workspace, _external_repo):

        @contextmanager
        def _gen_workspace(_instance):
            yield workspace

        with pendulum.test(freeze_datetime):
            unloadable_state = JobState(
                _get_unloadable_sensor_origin(), JobType.SENSOR, JobStatus.RUNNING
            )
            instance.add_job_state(unloadable_state)

            sensor_daemon = SensorDaemon.create_from_instance(instance)
            shutdown_event = threading.Event()
            sensor_daemon.run_loop(
                "my_uuid",
                shutdown_event,
                _gen_workspace,
                heartbeat_interval_seconds=DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
                error_interval_seconds=DEFAULT_DAEMON_ERROR_INTERVAL_SECONDS,
                until=freeze_datetime.add(seconds=65),
            )

            sensor_heartbeat = instance.get_daemon_heartbeats()["SENSOR"]
            assert sensor_heartbeat
            assert sensor_heartbeat.errors
            assert len(sensor_heartbeat.errors) == DAEMON_HEARTBEAT_ERROR_LIMIT
def __init__(self, instance):
    """Register the daemons that ``instance`` is configured to run.

    A sensor daemon is always added. A scheduler daemon is added when the
    instance's scheduler is a ``DagsterDaemonScheduler``, and a queued run
    coordinator daemon when its run coordinator is a ``QueuedRunCoordinator``.
    Each daemon's interval comes from ``self._get_interval_seconds`` keyed by
    the daemon class name. The registered daemon keys are asserted to equal
    ``self._expected_daemons(instance)``.

    Raises:
        Exception: if no daemon ended up registered.
    """
    self._instance = instance
    self._daemon_uuid = str(uuid.uuid4())
    self._daemons = {}
    self._last_heartbeat_time = None
    self._logger = get_default_daemon_logger("dagster-daemon")

    if isinstance(instance.scheduler, DagsterDaemonScheduler):
        catchup_limit = instance.scheduler.max_catchup_runs
        scheduler_daemon = SchedulerDaemon(
            instance,
            interval_seconds=self._get_interval_seconds(
                instance, SchedulerDaemon.__name__
            ),
            max_catchup_runs=catchup_limit,
        )
        self._add_daemon(scheduler_daemon)

    sensor_daemon = SensorDaemon(
        instance,
        interval_seconds=self._get_interval_seconds(instance, SensorDaemon.__name__),
    )
    self._add_daemon(sensor_daemon)

    if isinstance(instance.run_coordinator, QueuedRunCoordinator):
        concurrency_limit = instance.run_coordinator.max_concurrent_runs
        coordinator_daemon = QueuedRunCoordinatorDaemon(
            instance,
            interval_seconds=self._get_interval_seconds(
                instance, QueuedRunCoordinatorDaemon.__name__
            ),
            max_concurrent_runs=concurrency_limit,
        )
        self._add_daemon(coordinator_daemon)

    # The daemons registered here must agree with the expected set.
    assert set(self._expected_daemons(instance)) == self._daemons.keys()

    if not self._daemons:
        raise Exception("No daemons configured on the DagsterInstance")

    daemon_names = _sorted_quoted(type(daemon).__name__ for daemon in self.daemons)
    self._logger.info(
        "instance is configured with the following daemons: {}".format(daemon_names)
    )
def test_error_daemon(monkeypatch):
    """Check that an exception in the sensor iteration marks the daemon unhealthy.

    ``SensorDaemon.run_iteration`` is patched to raise; after one controller
    iteration the sensor daemon status must be unhealthy and its last
    heartbeat must carry the raised error's message.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_):
            raise DagsterInvariantViolationError("foobar")

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        controller = DagsterDaemonController(instance)
        started_at = pendulum.now("UTC")
        controller.run_iteration(started_at)

        sensor_status = get_daemon_status(
            instance, SensorDaemon.daemon_type(), started_at.float_timestamp
        )

        assert sensor_status.healthy == False
        reported_message = sensor_status.last_heartbeat.error.message.strip()
        assert (
            reported_message
            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
        )
def test_multiple_error_daemon(monkeypatch):
    """Check that several errors yielded by one iteration are all recorded.

    ``SensorDaemon.run_iteration`` is patched with a generator yielding two
    errors; after one controller iteration the sensor daemon status must be
    unhealthy with "foobar" and "bizbuz" on the heartbeat, in that order.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_):
            # SerializableErrorInfo positional fields: message, stack, cls_name, cause
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        controller = DagsterDaemonController(instance)
        started_at = pendulum.now("UTC")
        controller.run_iteration(started_at)

        sensor_status = get_daemon_status(
            instance, SensorDaemon.daemon_type(), started_at.float_timestamp
        )

        assert sensor_status.healthy == False
        assert len(sensor_status.last_heartbeat.errors) == 2
        assert sensor_status.last_heartbeat.errors[0].message.strip() == "foobar"
        assert sensor_status.last_heartbeat.errors[1].message.strip() == "bizbuz"
def test_error_daemon(monkeypatch):
    """Check that a raising sensor iteration leaves the daemon live but unhealthy.

    ``SensorDaemon.run_iteration`` is patched to raise. With a controller
    running, the test polls until all daemons are live, then expects the
    sensor status to be unhealthy with exactly one heartbeat error, the
    daemons to be not healthy overall, and still live. It fails after 10
    seconds without the daemons becoming live.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event, _grpc_server_registry):
            raise DagsterInvariantViolationError("foobar")
            # The unreachable yield makes this a generator function.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        started_at = pendulum.now("UTC")

        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                current_time = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    sensor_status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        current_time.float_timestamp,
                    )
                    assert sensor_status.healthy == False
                    assert len(sensor_status.last_heartbeat.errors) == 1
                    assert (
                        sensor_status.last_heartbeat.errors[0].message.strip()
                        == "dagster.core.errors.DagsterInvariantViolationError: foobar"
                    )

                    assert not all_daemons_healthy(
                        instance, curr_time_seconds=current_time.float_timestamp
                    )
                    assert all_daemons_live(
                        instance, curr_time_seconds=current_time.float_timestamp
                    )
                    break

                elapsed_seconds = (current_time - started_at).total_seconds()
                if elapsed_seconds > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_multiple_error_daemon(monkeypatch):
    """Check that two errors yielded by a sensor iteration reach the heartbeat.

    ``SensorDaemon.run_iteration`` is patched with a generator yielding two
    errors. With a controller running, the test polls until the sensor status
    is unhealthy, then expects "foobar" and "bizbuz" on the heartbeat in that
    order. It fails after 10 seconds without an unhealthy status.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event_, _grpc_server_registry):
            # SerializableErrorInfo positional fields: message, stack, cls_name, cause
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        started_at = pendulum.now("UTC")

        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                current_time = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    sensor_status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        current_time.float_timestamp,
                    )

                    if sensor_status.healthy == False:
                        assert len(sensor_status.last_heartbeat.errors) == 2
                        assert (
                            sensor_status.last_heartbeat.errors[0].message.strip()
                            == "foobar"
                        )
                        assert (
                            sensor_status.last_heartbeat.errors[1].message.strip()
                            == "bizbuz"
                        )
                        break

                elapsed_seconds = (current_time - started_at).total_seconds()
                if elapsed_seconds > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_get_daemon_error(self, graphql_context):
    """Check that heartbeat errors are exposed through the daemon health query.

    A sensor daemon heartbeat carrying one error with message "foobar" is
    stored; the ``DAEMON_HEALTH_QUERY`` result for the sensor daemon must then
    list exactly that error message. Skipped for ephemeral instances.
    """
    instance = graphql_context.instance
    if instance.is_ephemeral:
        pytest.skip("The daemon isn't compatible with an in-memory instance")

    failing_heartbeat = DaemonHeartbeat(
        timestamp=100.0,
        daemon_type=SensorDaemon.daemon_type(),
        daemon_id=None,
        errors=[
            SerializableErrorInfo(message="foobar", stack=[], cls_name=None, cause=None)
        ],
    )
    graphql_context.instance.add_daemon_heartbeat(failing_heartbeat)

    results = execute_dagster_graphql(graphql_context, DAEMON_HEALTH_QUERY)

    sensor_health = results.data["instance"]["daemonHealth"]["sensor"]
    assert sensor_health == {
        "lastHeartbeatErrors": [{"message": "foobar"}],
    }
def test_get_individual_daemons(self, graphql_context):
    """Check the per-daemon health fields returned by INDIVIDUAL_DAEMON_QUERY.

    A sensor daemon heartbeat with timestamp 100.0 and no errors is stored.
    The query result must then match the expected payload for the sensor, run
    coordinator and scheduler daemons; the scheduler expectations depend on
    whether the instance's scheduler is a ``DagsterDaemonScheduler``. Skipped
    for ephemeral instances.
    """
    instance = graphql_context.instance
    if instance.is_ephemeral:
        pytest.skip("The daemon isn't compatible with an in-memory instance")

    sensor_heartbeat = DaemonHeartbeat(
        timestamp=100.0,
        daemon_type=SensorDaemon.daemon_type(),
        daemon_id=None,
        errors=None,
    )
    graphql_context.instance.add_daemon_heartbeat(sensor_heartbeat)

    results = execute_dagster_graphql(graphql_context, INDIVIDUAL_DAEMON_QUERY)

    scheduler_required = isinstance(
        graphql_context.instance.scheduler, DagsterDaemonScheduler
    )
    if scheduler_required:
        scheduler_healthy = False
    else:
        scheduler_healthy = None

    expected_daemon_health = {
        "id": "daemonHealth",
        "sensor": {
            "daemonType": "SENSOR",
            "required": True,
            "healthy": False,
            "lastHeartbeatTime": 100.0,
        },
        "run_coordinator": {
            "daemonType": "QUEUED_RUN_COORDINATOR",
            "required": False,
            "healthy": None,
            "lastHeartbeatTime": None,
        },
        "scheduler": {
            "daemonType": "SCHEDULER",
            "required": scheduler_required,
            "healthy": scheduler_healthy,
            "lastHeartbeatTime": None,
        },
    }

    assert results.data == {"instance": {"daemonHealth": expected_daemon_health}}
def test_error_daemon(monkeypatch):
    """Check that a raising sensor iteration leaves the daemon live but unhealthy.

    ``SensorDaemon.run_iteration`` is patched to raise. After one controller
    iteration the sensor status must be unhealthy with exactly one heartbeat
    error; the daemons must be not healthy overall but still live.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_):
            raise DagsterInvariantViolationError("foobar")
            # The unreachable yield makes this a generator function.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        controller = DagsterDaemonController(instance)
        started_at = pendulum.now("UTC")
        controller.run_iteration(started_at)

        sensor_status = get_daemon_status(
            instance, SensorDaemon.daemon_type(), started_at.float_timestamp
        )

        assert sensor_status.healthy == False
        assert len(sensor_status.last_heartbeat.errors) == 1
        assert (
            sensor_status.last_heartbeat.errors[0].message.strip()
            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
        )

        assert not all_daemons_healthy(
            instance, curr_time_seconds=started_at.float_timestamp
        )
        assert all_daemons_live(
            instance, curr_time_seconds=started_at.float_timestamp
        )
def test_warn_multiple_daemons(capsys):
    """Check when the "Taking over from another SENSOR daemon process" warning is printed.

    Three phases, all against the same test instance with a 1-second
    heartbeat interval:

    1. A single controller runs until all daemons are live; the warning must
       not appear in captured stdout.
    2. After that controller has exited, a second one starts and runs until a
       heartbeat with a new timestamp is seen; the warning must still not
       appear.
    3. While the second controller is still running, a third one is started;
       the test waits (up to 60 seconds) for the warning to appear.
    """
    from dagster.daemon.daemon import SensorDaemon

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        # Phase 1: a lone controller must not warn.
        with daemon_controller_from_instance(
            instance, heartbeat_interval_seconds=heartbeat_interval_seconds
        ):
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                ):
                    captured = capsys.readouterr()
                    assert "Taking over from another SENSOR daemon process" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for daemon status")

                time.sleep(0.5)

            # Discard any remaining captured output before the next phase.
            capsys.readouterr()

        init_time = pendulum.now("UTC")

        # Remember the last heartbeat written by the first controller so that a
        # heartbeat from the second controller can be told apart from it.
        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            now.float_timestamp,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        )
        last_heartbeat_time = status.last_heartbeat.timestamp

        # No warning when a second controller starts up again
        with daemon_controller_from_instance(
            instance, heartbeat_interval_seconds=heartbeat_interval_seconds
        ):
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # A changed timestamp means the new controller has written a heartbeat.
                if status.last_heartbeat and status.last_heartbeat.timestamp != last_heartbeat_time:
                    captured = capsys.readouterr()
                    assert "Taking over from another SENSOR daemon process" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for new daemon status")

                time.sleep(0.5)

            # NOTE(review): last_heartbeat_time is reassigned here but not read
            # again in the rest of this test.
            status = get_daemon_status(
                instance,
                SensorDaemon.daemon_type(),
                now.float_timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )
            last_heartbeat_time = status.last_heartbeat.timestamp

            # Starting up a controller while one is running produces the warning though
            with daemon_controller_from_instance(
                instance, heartbeat_interval_seconds=heartbeat_interval_seconds
            ):
                # Wait for heartbeats while two controllers are running at once and there will
                # be a warning
                init_time = pendulum.now("UTC")

                while True:
                    now = pendulum.now("UTC")

                    captured = capsys.readouterr()
                    if "Taking over from another SENSOR daemon process" in captured.out:
                        break

                    if (now - init_time).total_seconds() > 60:
                        raise Exception("timed out waiting for heartbeats")

                    time.sleep(5)
def test_error_daemon(monkeypatch):
    """Check that heartbeat errors accumulate to a cap and later drain away.

    ``SensorDaemon.run_iteration`` is patched to raise while
    ``should_raise_errors`` is true. With a controller running (1-second
    heartbeat interval, 10-second error interval) the test:

    1. polls until the sensor heartbeat carries exactly 5 errors, checks the
       daemon is unhealthy (but healthy when errors are ignored), waits 3
       seconds and checks the error count is still 5;
    2. stops raising errors and polls until the heartbeat error list is empty.

    Each polling phase fails after 15 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        should_raise_errors = True

        def run_iteration_error(_, _instance, _workspace):
            # Reads should_raise_errors from the enclosing scope on every call,
            # so flipping the flag below switches the errors off.
            if should_raise_errors:
                raise DagsterInvariantViolationError("foobar")
            yield

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        heartbeat_interval_seconds = 1

        gen_daemons = lambda instance: [SensorDaemon(interval_seconds=1)]

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=gen_daemons,
            error_interval_seconds=10,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                    ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )
                    assert status.healthy == False

                    # Errors build up until they hit 5
                    if len(status.last_heartbeat.errors) == 5:
                        assert (
                            status.last_heartbeat.errors[0].message.strip()
                            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
                        )
                        assert not get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            curr_time_seconds=now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                        ).healthy
                        assert get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            curr_time_seconds=now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                            ignore_errors=True,
                        ).healthy

                        time.sleep(3)

                        status = get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                        )

                        # Error count does not rise above 5
                        assert len(status.last_heartbeat.errors) == 5

                        break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False
            init_time = pendulum.now("UTC")

            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # Done once the heartbeat no longer carries any errors
                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat errors to return to 0")

                time.sleep(0.5)
def test_error_daemon(monkeypatch):
    """Check that heartbeat errors are capped at 5, keep rotating, and later drain.

    ``SensorDaemon.core_loop`` is patched to raise a numbered error
    ("foobar:<n>") on every call while ``should_raise_errors`` is true. With a
    controller running (1-second heartbeat interval, 10-second error interval)
    the test:

    1. polls until the heartbeat carries at least 5 errors and the first one
       is numbered above 5, checks the listed numbers decrease consecutively,
       checks the daemon is unhealthy (but healthy when errors are ignored),
       waits 3 seconds and checks there are still exactly 5 errors whose
       first number has grown;
    2. stops raising errors and polls until the heartbeat error list is empty.

    Each polling phase fails after 15 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        should_raise_errors = True

        # A dict so the nested function can update the counter in place.
        error_count = {"count": 0}

        def run_loop_error(_, _instance, _workspace):
            # Reads should_raise_errors from the enclosing scope on every call,
            # so flipping the flag below switches the errors off.
            if should_raise_errors:
                time.sleep(0.5)
                error_count["count"] = error_count["count"] + 1
                raise DagsterInvariantViolationError("foobar:" + str(error_count["count"]))

            while True:
                yield
                time.sleep(0.5)

        def _get_error_number(error):
            # Extract the counter that run_loop_error appended after "foobar:".
            error_message = error.message.strip()
            return int(error_message.split("foobar:")[1])

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        heartbeat_interval_seconds = 1

        gen_daemons = lambda instance: [SensorDaemon()]

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=gen_daemons,
            error_interval_seconds=10,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                    ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )
                    assert status.healthy == False

                    # Errors build up until there are > 5, then pull off the last
                    if len(status.last_heartbeat.errors) >= 5:
                        first_error_number = _get_error_number(status.last_heartbeat.errors[0])

                        if first_error_number > 5:
                            # Verify error numbers decrease consecutively
                            assert [
                                _get_error_number(error)
                                for error in status.last_heartbeat.errors
                            ] == list(range(first_error_number, first_error_number - 5, -1))

                            assert not get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            ).healthy
                            assert get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                                ignore_errors=True,
                            ).healthy

                            time.sleep(3)

                            status = get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            )

                            # Error count does not rise above 5, continues to increase
                            assert len(status.last_heartbeat.errors) == 5

                            new_first_error_number = _get_error_number(
                                status.last_heartbeat.errors[0]
                            )

                            assert new_first_error_number > first_error_number

                            break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False
            init_time = pendulum.now("UTC")

            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # Done once the heartbeat no longer carries any errors
                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat errors to return to 0")

                time.sleep(0.5)