Example #1
0
def run_command():
    """Run the daemon controller until interrupted, checking daemons periodically.

    The loop wakes up every second but only calls ``controller.check_daemons()``
    once at least ``2 * DAEMON_HEARTBEAT_TOLERANCE_SECONDS`` have passed since
    the previous check (or since startup).

    Raises:
        Exception: if the loaded instance is ephemeral (in-memory).
    """
    with capture_interrupts(), DagsterInstance.get() as instance:
        if instance.is_ephemeral:
            raise Exception(
                "dagster-daemon can't run using an in-memory instance. Make sure "
                "the DAGSTER_HOME environment variable has been set correctly and that "
                "you have created a dagster.yaml file there."
            )

        with daemon_controller_from_instance(instance) as controller:
            last_check = pendulum.now("UTC")
            while True:
                # Let a long period go by before check_daemons() gets a chance
                # to act on a hanging or failed daemon.
                with raise_interrupts_as(KeyboardInterrupt):
                    time.sleep(1)
                    elapsed = (pendulum.now("UTC") - last_check).total_seconds()
                    still_waiting = elapsed < 2 * DAEMON_HEARTBEAT_TOLERANCE_SECONDS

                if still_waiting:
                    continue

                controller.check_daemons()
                last_check = pendulum.now("UTC")
def test_transient_heartbeat_failure(mocker):
    """Check that a failing status lookup is tolerated at first but not indefinitely.

    ``dagster.daemon.controller.get_daemon_status`` is patched to always raise.
    The first heartbeat check must not raise; a check made after sleeping for
    twice the heartbeat tolerance must raise the "no longer sending heartbeats"
    error.
    """
    interval_seconds = 1
    tolerance_seconds = 5
    expected_error = "Stopping dagster-daemon process since the following threads are no longer sending heartbeats"

    with instance_for_test() as instance:
        mocker.patch(
            "dagster.daemon.controller.get_daemon_status",
            side_effect=Exception("Transient heartbeat failure"),
        )

        with daemon_controller_from_instance(
            instance,
            heartbeat_interval_seconds=interval_seconds,
            heartbeat_tolerance_seconds=tolerance_seconds,
        ) as controller:
            # A single failed lookup is not enough to stop the process.
            controller.check_daemon_heartbeats()

            time.sleep(2 * tolerance_seconds)

            with pytest.raises(Exception, match=expected_error):
                controller.check_daemon_heartbeats()
Example #3
0
def test_different_intervals(caplog):
    """Check that the run coordinator and scheduler daemons run at different rates.

    The instance is configured with the daemon scheduler and a queued run
    coordinator whose ``dequeue_interval_seconds`` is 5. The test polls every
    half second until ``_run_coordinator_ran(caplog)`` reports three runs, at
    which point ``_scheduler_ran(caplog)`` must report exactly one.

    Args:
        caplog: pytest log-capture fixture, handed to the ``_*_ran`` helpers.

    Raises:
        Exception: if the run coordinator has not reached three runs within
            45 seconds.
    """
    instance_overrides = {
        "scheduler": {
            "module": "dagster.core.scheduler",
            "class": "DagsterDaemonScheduler",
        },
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
            "config": {"dequeue_interval_seconds": 5},
        },
    }

    with instance_for_test(overrides=instance_overrides) as instance:
        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(instance):
            while True:
                now = pendulum.now("UTC")
                # Wait until the run coordinator has run three times; by then
                # the scheduler should only have run once.
                if _run_coordinator_ran(caplog) == 3:
                    assert _scheduler_ran(caplog) == 1
                    break

                if (now - init_time).total_seconds() > 45:
                    # The message now matches the condition actually awaited
                    # above (three runs, not two).
                    raise Exception(
                        "Timed out waiting for run queue daemon to execute three times"
                    )

                time.sleep(0.5)
Example #4
0
def test_no_memory_leaks():
    """Run the daemons with a started schedule and sensor until object growth stops.

    Every 30 seconds the controller's threads and heartbeats are checked and
    ``objgraph.growth`` is sampled, restricted to objects whose module name
    contains "dagster". The test ends successfully on the first sample that
    reports no growth.

    Raises:
        Exception: if a sample still reports growth more than 300 seconds
            after the controller was started.
    """

    def _defined_in_dagster(obj):
        # Falsy when the module cannot be determined, otherwise a substring test.
        module = inspect.getmodule(obj)
        return module and "dagster" in module.__name__

    instance_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator",
            "class": "QueuedRunCoordinator",
        },
        "run_launcher": {
            "class": "DefaultRunLauncher",
            "module": "dagster.core.launcher.default_run_launcher",
            "config": {
                "wait_for_processes": False,
            },
        },
    }

    with instance_for_test(overrides=instance_overrides) as instance:
        with get_example_repo(instance) as repo:
            schedule = repo.get_external_schedule("always_run_schedule")
            sensor = repo.get_external_sensor("always_on_sensor")

            instance.start_schedule(schedule)
            instance.start_sensor(sensor)

            with daemon_controller_from_instance(
                instance,
                workspace_load_target=workspace_load_target(),
                wait_for_processes_on_exit=True,
            ) as controller:
                start_time = time.time()

                # First sample; its result is never read.
                # NOTE(review): presumably this seeds the baseline that later
                # growth() calls compare against - confirm in the objgraph docs.
                objgraph.growth(limit=10, filter=_defined_in_dagster)

                while True:
                    time.sleep(30)

                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    growth = objgraph.growth(limit=10, filter=_defined_in_dagster)
                    if not growth:
                        elapsed = int(time.time() - start_time)
                        print(  # pylint: disable=print-call
                            f"Memory stopped growing after {elapsed} seconds"
                        )
                        break

                    if (time.time() - start_time) > 300:
                        raise Exception(
                            "Memory still growing after 5 minutes. Most recent growth: "
                            + str(growth)
                        )

                    print("Growth: " + str(growth))  # pylint: disable=print-call
Example #5
0
def _daemon_run_command(instance, kwargs):
    """Start a daemon controller for ``instance`` and run its check loop.

    Args:
        instance: instance handed to ``daemon_controller_from_instance``.
        kwargs: options passed to ``get_workspace_load_target`` to resolve the
            workspace load target.
    """
    controller_cm = daemon_controller_from_instance(
        instance,
        workspace_load_target=get_workspace_load_target(kwargs),
        heartbeat_tolerance_seconds=_get_heartbeat_tolerance(),
    )
    with controller_cm as controller:
        controller.check_daemon_loop()
Example #6
0
def run_command():
    """Run the daemon controller's check loop against the default instance.

    Raises:
        Exception: if the loaded instance is ephemeral (in-memory).
    """
    with capture_interrupts(), DagsterInstance.get() as instance:
        if instance.is_ephemeral:
            raise Exception(
                "dagster-daemon can't run using an in-memory instance. Make sure "
                "the DAGSTER_HOME environment variable has been set correctly and that "
                "you have created a dagster.yaml file there."
            )

        controller_cm = daemon_controller_from_instance(instance)
        with controller_cm as controller:
            controller.check_daemon_loop()
def test_multiple_error_daemon(monkeypatch):
    """Check that several errors yielded by a daemon loop show up in its heartbeat.

    ``SensorDaemon.core_loop`` is replaced by a generator that yields two
    errors and then idles. The test polls until the sensor status is unhealthy
    with exactly two heartbeat errors, checks their order, and fails if that
    has not happened within 10 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_loop_error(_, _instance, _workspace):
            # NOTE(review): positional args look like
            # (message, stack, cls_name, cause) - confirm against SerializableErrorInfo.
            for text in ("foobar", "bizbuz"):
                yield SerializableErrorInfo(text, None, None, None)

            while True:
                yield
                time.sleep(0.5)

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                daemons_live = all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                )
                if daemons_live:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    statuses = get_daemon_statuses(
                        instance, [SensorDaemon.daemon_type()], now.float_timestamp
                    )
                    status = statuses[SensorDaemon.daemon_type()]

                    if status.healthy == False and len(status.last_heartbeat.errors) == 2:
                        first_error, second_error = status.last_heartbeat.errors
                        assert first_error.message.strip() == "bizbuz"
                        assert second_error.message.strip() == "foobar"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
Example #8
0
def test_backfill_instance():
    """With ``backfill.daemon_enabled`` set, expect three daemons including a BackfillDaemon."""
    instance_overrides = {"backfill": {"daemon_enabled": True}}

    with instance_for_test(overrides=instance_overrides) as instance:
        controller_cm = daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        )
        with controller_cm as controller:
            running = controller.daemons

            assert len(running) == 3
            assert any(isinstance(d, BackfillDaemon) for d in running)
def test_run_coordinator_instance():
    """With a QueuedRunCoordinator configured, expect four daemons including its daemon."""
    instance_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=instance_overrides) as instance:
        controller_cm = daemon_controller_from_instance(instance)
        with controller_cm as controller:
            running = controller.daemons

            assert len(running) == 4
            assert any(isinstance(d, QueuedRunCoordinatorDaemon) for d in running)
Example #10
0
def test_scheduler_instance():
    """With the DagsterDaemonScheduler configured, expect three daemons including a SchedulerDaemon."""
    instance_overrides = {
        "scheduler": {
            "module": "dagster.core.scheduler",
            "class": "DagsterDaemonScheduler",
        },
    }

    with instance_for_test(overrides=instance_overrides) as instance:
        controller_cm = daemon_controller_from_instance(instance)
        with controller_cm as controller:
            running = controller.daemons

            assert len(running) == 3
            assert any(isinstance(d, SchedulerDaemon) for d in running)
Example #11
0
def test_set_sensor_interval(caplog):
    """Wait for the sensor daemon to run once on an instance with a 5s sensor interval.

    Args:
        caplog: pytest log-capture fixture, handed to ``_sensor_ran``.

    Raises:
        Exception: if ``_sensor_ran(caplog)`` has not reported exactly one run
            within 10 seconds.
    """
    instance_overrides = {"sensor_settings": {"interval_seconds": 5}}

    with instance_for_test(overrides=instance_overrides) as instance:
        init_time = pendulum.now("UTC")

        with daemon_controller_from_instance(instance, wait_for_processes_on_exit=True):
            while True:
                now = pendulum.now("UTC")

                # Done as soon as the sensor daemon is reported to have run once.
                if _sensor_ran(caplog) == 1:
                    break

                waited_seconds = (now - init_time).total_seconds()
                if waited_seconds > 10:
                    raise Exception("Timed out waiting for sensor daemon to execute")

                time.sleep(0.5)
Example #12
0
def test_healthy_with_different_daemons():
    """A second instance is neither healthy nor live while daemons run for a different one.

    A controller is started for a default test instance; a second test instance
    configured with a QueuedRunCoordinator is then checked and must report
    neither all-healthy nor all-live.
    """
    other_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test() as instance:
        with daemon_controller_from_instance(instance):
            with instance_for_test(overrides=other_overrides) as other_instance:
                timestamp = pendulum.now("UTC").float_timestamp

                healthy = all_daemons_healthy(other_instance, curr_time_seconds=timestamp)
                assert not healthy

                live = all_daemons_live(other_instance, curr_time_seconds=timestamp)
                assert not live
def test_thread_die_daemon(monkeypatch):
    """Check that ``check_daemon_threads`` raises once the sensor thread has died.

    ``SensorDaemon.core_loop`` is replaced by a generator that records that it
    ran and then raises ``KeyboardInterrupt``. Once it has run and the scheduler
    daemon reports healthy, the test keeps calling ``check_daemon_threads``
    until it raises, and verifies the error names the SENSOR thread.

    Raises:
        Exception: if no such error was raised within 20 seconds.
    """
    expected_error = "Stopping dagster-daemon process since the following threads are no longer running: ['SENSOR']"

    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SchedulerDaemon, SensorDaemon

        iteration_ran = {"ran": False}

        def run_loop_error(_, _instance, _workspace):
            iteration_ran["ran"] = True
            raise KeyboardInterrupt
            # Never reached; its presence makes this function a generator.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        heartbeat_interval_seconds = 1

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                statuses = get_daemon_statuses(
                    instance,
                    [SchedulerDaemon.daemon_type()],
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )
                scheduler_status = statuses[SchedulerDaemon.daemon_type()]

                if iteration_ran["ran"] and scheduler_status.healthy:
                    try:
                        # Should eventually throw since the sensor thread is interrupted
                        controller.check_daemon_threads()
                    except Exception as e:
                        assert expected_error in str(e)
                        break

                if (now - init_time).total_seconds() > 20:
                    raise Exception("timed out waiting for check_daemons to fail")

                time.sleep(0.5)
Example #14
0
def test_no_memory_leaks():
    """Run the daemons with a started schedule and sensor until object growth stops.

    Every 30 seconds ``controller.check_daemons()`` is called and
    ``objgraph.growth`` is sampled, restricted to objects whose module name
    contains "dagster". The test ends successfully on the first sample that
    reports no growth.

    Raises:
        Exception: if a sample still reports growth more than 300 seconds
            after the controller was started.
    """

    def _defined_in_dagster(obj):
        # Falsy when the module cannot be determined, otherwise a substring test.
        module = inspect.getmodule(obj)
        return module and "dagster" in module.__name__

    instance_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator",
            "class": "QueuedRunCoordinator",
        }
    }

    with instance_for_test(overrides=instance_overrides) as instance:
        with get_example_repo() as repo:
            schedule = repo.get_external_schedule("always_run_schedule")
            sensor = repo.get_external_sensor("always_on_sensor")

            instance.start_schedule_and_update_storage_state(schedule)
            instance.start_sensor(sensor)

            with daemon_controller_from_instance(instance) as controller:
                start_time = time.time()

                # First sample; its result is never read.
                # NOTE(review): presumably this seeds the baseline that later
                # growth() calls compare against - confirm in the objgraph docs.
                objgraph.growth(limit=10, filter=_defined_in_dagster)

                while True:
                    time.sleep(30)

                    controller.check_daemons()

                    growth = objgraph.growth(limit=10, filter=_defined_in_dagster)
                    if not growth:
                        elapsed = int(time.time() - start_time)
                        print(  # pylint: disable=print-call
                            f"Memory stopped growing after {elapsed} seconds"
                        )
                        break

                    if (time.time() - start_time) > 300:
                        raise Exception(
                            "Memory still growing after 5 minutes. Most recent growth: "
                            + str(growth)
                        )

                    print("Growth: " + str(growth))  # pylint: disable=print-call
Example #15
0
def test_healthy():
    """Check the all-healthy / all-live helpers before, during and "after" a daemon run.

    Before any controller starts, the instance is neither healthy nor live.
    Once a controller is running the test polls until both helpers return
    true, calls ``check_daemons()``, and then verifies that both helpers
    return false when evaluated 100 seconds into the future.

    Raises:
        Exception: if the instance has not become healthy and live within
            10 seconds.
    """
    instance_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=instance_overrides) as instance:

        def healthy_at(timestamp):
            return all_daemons_healthy(instance, curr_time_seconds=timestamp)

        def live_at(timestamp):
            return all_daemons_live(instance, curr_time_seconds=timestamp)

        init_time = pendulum.now("UTC")

        assert not healthy_at(init_time.float_timestamp)
        assert not live_at(init_time.float_timestamp)

        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if healthy_at(now.float_timestamp) and live_at(now.float_timestamp):
                    controller.check_daemons()

                    beyond_tolerated_time = now.float_timestamp + 100

                    assert not healthy_at(beyond_tolerated_time)
                    assert not live_at(beyond_tolerated_time)
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for instance to become healthy")

                time.sleep(0.5)
Example #16
0
def test_multiple_error_daemon(monkeypatch):
    """Check that several errors yielded by one iteration show up in the heartbeat.

    ``SensorDaemon.run_iteration`` is replaced by a generator yielding two
    errors. Once all daemons are live the test polls the sensor status until it
    is unhealthy, then requires exactly two heartbeat errors in yield order.

    Raises:
        Exception: if the sensor has not become unhealthy within 10 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event_,
                                _grpc_server_registry):
            # NOTE(review): positional args look like
            # (message, stack, cls_name, cause) - confirm against SerializableErrorInfo.
            for text in ("foobar", "bizbuz"):
                yield SerializableErrorInfo(text, None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        init_time = pendulum.now("UTC")

        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    status = get_daemon_status(
                        instance, SensorDaemon.daemon_type(), now.float_timestamp
                    )

                    if status.healthy == False:
                        errors = status.last_heartbeat.errors
                        assert len(errors) == 2
                        assert errors[0].message.strip() == "foobar"
                        assert errors[1].message.strip() == "bizbuz"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
Example #17
0
def test_error_daemon(monkeypatch):
    """Check that an error raised by a daemon iteration makes it unhealthy but still live.

    ``SensorDaemon.run_iteration`` is replaced by a generator that raises
    ``DagsterInvariantViolationError("foobar")``. As soon as all daemons are
    live, the sensor status must be unhealthy with that single error, and the
    instance must be live but not healthy.

    Raises:
        Exception: if the daemons have not become live within 10 seconds.
    """
    expected_message = "dagster.core.errors.DagsterInvariantViolationError: foobar"

    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event,
                                _grpc_server_registry):
            raise DagsterInvariantViolationError("foobar")
            # Never reached; its presence makes this function a generator.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    status = get_daemon_status(
                        instance, SensorDaemon.daemon_type(), now.float_timestamp
                    )

                    assert status.healthy == False
                    errors = status.last_heartbeat.errors
                    assert len(errors) == 1
                    assert errors[0].message.strip() == expected_message

                    timestamp = now.float_timestamp
                    assert not all_daemons_healthy(instance, curr_time_seconds=timestamp)
                    assert all_daemons_live(instance, curr_time_seconds=timestamp)
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
Example #18
0
def test_thread_die_daemon(monkeypatch):
    """Check that ``check_daemons`` raises once the sensor thread has died.

    ``SensorDaemon.run_iteration`` is replaced by a generator that records that
    it ran and then raises ``KeyboardInterrupt``. Once it has run and the
    scheduler daemon reports healthy, the test keeps calling ``check_daemons``
    until it raises, and verifies the error names the SENSOR thread.

    Raises:
        Exception: if no such error was raised within 20 seconds.
    """
    expected_error = "Stopping dagster-daemon process since the following threads are no longer sending heartbeats: ['SENSOR']"

    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SchedulerDaemon, SensorDaemon

        iteration_ran = {"ran": False}

        def run_iteration_error(_, _instance, _daemon_shutdown_event,
                                _grpc_server_registry):
            iteration_ran["ran"] = True
            raise KeyboardInterrupt
            # Never reached; its presence makes this function a generator.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                scheduler_status = get_daemon_status(
                    instance, SchedulerDaemon.daemon_type(), now.float_timestamp
                )

                if iteration_ran["ran"] and scheduler_status.healthy:
                    try:
                        # Should eventually throw since the sensor thread is interrupted
                        controller.check_daemons()
                    except Exception as e:  # pylint: disable=broad-except
                        assert expected_error in str(e)
                        break

                if (now - init_time).total_seconds() > 20:
                    raise Exception("timed out waiting for check_daemons to fail")

                time.sleep(0.5)
def test_warn_multiple_daemons(capsys):
    """Check when the "Taking over from another SENSOR daemon process" warning is printed.

    Three phases, all against the same instance:
      1. A first controller starts; once all daemons are live, stdout must not
         contain the warning.
      2. After it has exited, a second controller starts; when a new sensor
         heartbeat timestamp appears, stdout must still not contain it.
      3. While the second controller is still running, a third one starts; the
         warning must appear on stdout within 60 seconds.

    Args:
        capsys: pytest fixture used to read captured stdout.
    """
    from dagster.daemon.daemon import SensorDaemon

    takeover_warning = "Taking over from another SENSOR daemon process"

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        def start_controller():
            return daemon_controller_from_instance(
                instance, heartbeat_interval_seconds=heartbeat_interval_seconds
            )

        def sensor_status_at(moment):
            return get_daemon_status(
                instance,
                SensorDaemon.daemon_type(),
                moment.float_timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )

        # Phase 1: a single controller never warns.
        with start_controller():
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                ):
                    captured = capsys.readouterr()
                    assert takeover_warning not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for daemon status")

                time.sleep(0.5)

            # Discard anything printed so far.
            capsys.readouterr()

        init_time = pendulum.now("UTC")

        status = sensor_status_at(now)
        last_heartbeat_time = status.last_heartbeat.timestamp

        # Phase 2: no warning when a second controller starts up again
        with start_controller():
            while True:
                now = pendulum.now("UTC")

                status = sensor_status_at(now)

                if status.last_heartbeat and status.last_heartbeat.timestamp != last_heartbeat_time:
                    captured = capsys.readouterr()
                    assert takeover_warning not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for new daemon status")

                time.sleep(0.5)

            status = sensor_status_at(now)
            last_heartbeat_time = status.last_heartbeat.timestamp

            # Phase 3: starting up a controller while one is running produces
            # the warning though
            with start_controller():
                # Wait for heartbeats while two controllers are running at once
                # and there will be a warning
                init_time = pendulum.now("UTC")

                while True:
                    now = pendulum.now("UTC")

                    captured = capsys.readouterr()
                    if takeover_warning in captured.out:
                        break

                    if (now - init_time).total_seconds() > 60:
                        raise Exception("timed out waiting for heartbeats")

                    time.sleep(5)
def test_error_daemon(monkeypatch):
    """Check that heartbeat errors accumulate up to five and clear once errors stop.

    ``SensorDaemon.run_iteration`` is replaced by a generator that raises
    ``DagsterInvariantViolationError("foobar")`` while ``should_raise_errors``
    is true. The controller runs only that sensor daemon, with
    ``error_interval_seconds=10``.

    Phase 1: while the sensor is healthy when errors are ignored, its regular
    status must be unhealthy. Once it carries five errors, the test checks the
    first error's message, waits three seconds, and verifies the count is still
    five.

    Phase 2: errors are switched off and the test polls until the heartbeat
    carries no errors.

    Raises:
        Exception: if either phase has not completed within 15 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        # Read by run_iteration_error through its closure; rebinding it later in
        # this function changes what the patched daemon sees.
        should_raise_errors = True

        def run_iteration_error(_, _instance, _workspace):
            if should_raise_errors:
                raise DagsterInvariantViolationError("foobar")
            yield

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        heartbeat_interval_seconds = 1

        def gen_daemons(instance):  # pylint: disable=unused-argument
            # Run only the patched sensor daemon.
            return [SensorDaemon(interval_seconds=1)]

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=gen_daemons,
            error_interval_seconds=10,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                    ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )

                    assert status.healthy == False

                    # Errors build up until they hit 5
                    if len(status.last_heartbeat.errors) == 5:
                        assert (
                            status.last_heartbeat.errors[0].message.strip()
                            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
                        )
                        assert not get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            curr_time_seconds=now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                        ).healthy
                        assert get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            curr_time_seconds=now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                            ignore_errors=True,
                        ).healthy

                        # Give the daemon time to hit more errors.
                        time.sleep(3)

                        status = get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            now.float_timestamp,
                            heartbeat_interval_seconds=heartbeat_interval_seconds,
                        )

                        # Error count does not rise above 5
                        assert len(status.last_heartbeat.errors) == 5

                        break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False
            init_time = pendulum.now("UTC")

            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # Done once the heartbeat no longer carries any errors
                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception(
                        "timed out waiting for heartbeat errors to return to 0"
                    )

                time.sleep(0.5)
def test_healthy():
    """Check the all-healthy / all-live helpers before, during and "after" a daemon run.

    Uses a one-second heartbeat interval throughout. Before any controller
    starts, the instance is neither healthy nor live. Once a controller is
    running the test polls until both helpers return true, checks the
    controller's threads and heartbeats, and then verifies both helpers return
    false when evaluated one second past
    ``DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS`` into the future.

    Raises:
        Exception: if the instance has not become healthy and live within
            10 seconds.
    """
    instance_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=instance_overrides) as instance:
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        def healthy_at(timestamp):
            return all_daemons_healthy(
                instance,
                curr_time_seconds=timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )

        def live_at(timestamp):
            return all_daemons_live(
                instance,
                curr_time_seconds=timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )

        assert not healthy_at(init_time.float_timestamp)
        assert not live_at(init_time.float_timestamp)

        with daemon_controller_from_instance(
            instance, heartbeat_interval_seconds=heartbeat_interval_seconds
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if healthy_at(now.float_timestamp) and live_at(now.float_timestamp):
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    beyond_tolerated_time = (
                        now.float_timestamp + DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS + 1
                    )

                    assert not healthy_at(beyond_tolerated_time)
                    assert not live_at(beyond_tolerated_time)
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for instance to become healthy")

                time.sleep(0.5)
Example #22
0
def test_error_daemon(monkeypatch):
    """Exercise heartbeat error reporting for a sensor daemon whose loop keeps raising.

    ``SensorDaemon.core_loop`` is monkeypatched with a loop that raises a
    numbered ``DagsterInvariantViolationError`` for as long as a flag is set.
    The test first polls until the heartbeat carries five errors whose numbers
    have moved past 5, checks that those numbers are consecutive (highest
    first) and that the list stays at five entries while its first number keeps
    growing. It then switches the failures off and polls until the heartbeat's
    error list is empty.

    Args:
        monkeypatch: pytest ``monkeypatch`` fixture, used to replace
            ``SensorDaemon.core_loop``.

    Raises:
        Exception: if either polling phase runs for more than 15 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        heartbeat_interval_seconds = 1
        failures_enabled = True
        raised = {"count": 0}

        def _flaky_core_loop(_, _instance, _workspace):
            # Generator function: while failures are enabled, pause briefly and
            # raise an error tagged with a running number.
            if failures_enabled:
                time.sleep(0.5)
                raised["count"] += 1
                raise DagsterInvariantViolationError("foobar:" + str(raised["count"]))

            # Failures disabled: idle forever, yielding between short sleeps.
            while True:
                yield
                time.sleep(0.5)

        def _error_number(error):
            # Messages have the form "foobar:<n>"; recover <n> as an int.
            return int(error.message.strip().split("foobar:")[1])

        def _sensor_status(*args, **kwargs):
            # Sensor daemon status lookup using this test's heartbeat interval;
            # extra positional/keyword arguments are forwarded untouched.
            return get_daemon_status(
                instance,
                SensorDaemon.daemon_type(),
                *args,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
                **kwargs
            )

        monkeypatch.setattr(SensorDaemon, "core_loop", _flaky_core_loop)

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=lambda instance: [SensorDaemon()],
            error_interval_seconds=10,
        ) as controller:

            def _error_window_verified(now):
                # Returns True once the five-entry error list has been checked,
                # False when the heartbeat is not yet in a state worth checking.
                if not _sensor_status(ignore_errors=True).healthy:
                    return False

                # Despite error, daemon should still be running
                controller.check_daemon_threads()
                controller.check_daemon_heartbeats()

                status = _sensor_status(now.float_timestamp)

                # Equality check kept on purpose: a ``None`` value would fail here.
                assert status.healthy == False

                # Errors build up until there are > 5, then pull off the last
                errors = status.last_heartbeat.errors
                if len(errors) < 5:
                    return False

                first_number = _error_number(errors[0])
                if first_number <= 5:
                    return False

                # Verify error numbers decrease consecutively
                expected_numbers = list(range(first_number, first_number - 5, -1))
                assert [_error_number(err) for err in errors] == expected_numbers

                # Unhealthy when errors count, healthy when they are ignored.
                assert not _sensor_status(curr_time_seconds=now.float_timestamp).healthy
                assert _sensor_status(
                    curr_time_seconds=now.float_timestamp, ignore_errors=True
                ).healthy

                time.sleep(3)

                later_errors = _sensor_status(now.float_timestamp).last_heartbeat.errors

                # Error count does not rise above 5, continues to increase
                assert len(later_errors) == 5
                assert _error_number(later_errors[0]) > first_number

                return True

            while True:
                now = pendulum.now("UTC")

                if _error_window_verified(now):
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            failures_enabled = False
            init_time = pendulum.now("UTC")

            while True:
                now = pendulum.now("UTC")

                status = _sensor_status(now.float_timestamp)

                # Finished as soon as the heartbeat reports no errors at all
                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for hearrteat errors to return to 0")

                time.sleep(0.5)
Example #23
0
def _daemon_run_command(instance):
    """Open a daemon controller for ``instance`` and run its daemon check loop.

    The controller is created with the heartbeat tolerance returned by
    ``_get_heartbeat_tolerance()`` and is used as a context manager around
    ``check_daemon_loop()``.

    Args:
        instance: the instance handed to ``daemon_controller_from_instance``.
    """
    tolerance_seconds = _get_heartbeat_tolerance()
    controller_context = daemon_controller_from_instance(
        instance,
        heartbeat_tolerance_seconds=tolerance_seconds,
    )
    with controller_context as controller:
        controller.check_daemon_loop()