Example #1
def _create_sensor_tick(instance):
    with ProcessGrpcServerRegistry(wait_for_processes_on_exit=True) as grpc_server_registry:
        with RepositoryLocationHandleManager(grpc_server_registry) as handle_manager:
            list(
                execute_sensor_iteration(
                    instance, get_default_daemon_logger("SensorDaemon"), handle_manager
                )
            )
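Every example on this page follows the same layering: a gRPC server registry owns the repository server processes, a RepositoryLocationHandleManager scopes the location handles, and the iteration generator is drained (here with list(...)) so the evaluation actually runs. A minimal sketch of exercising this helper from a test is shown below; the use of DagsterInstance.ephemeral() is an assumption and may not suit every sensor.

from dagster import DagsterInstance


def test_single_sensor_tick():
    # Sketch only: an ephemeral instance is assumed to be enough for the
    # sensors under test; a persistent test instance may be needed in practice.
    with DagsterInstance.ephemeral() as instance:
        _create_sensor_tick(instance)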
Example #2
def evaluate_sensors(instance, grpc_server_registry):

    with RepositoryLocationHandleManager(
            grpc_server_registry) as handle_manager:
        list(
            execute_sensor_iteration(
                instance,
                get_default_daemon_logger("SensorDaemon"),
                handle_manager,
            ))
Example #3
def launch_scheduled_runs(
    instance,
    grpc_server_registry,
    logger,
    end_datetime_utc,
    max_catchup_runs=DEFAULT_MAX_CATCHUP_RUNS,
    debug_crash_flags=None,
):
    schedules = [
        s for s in instance.all_stored_job_state(job_type=JobType.SCHEDULE)
        if s.status == JobStatus.RUNNING
    ]

    if not schedules:
        logger.info(
            "Not checking for any runs since no schedules have been started.")
        return

    schedule_names = ", ".join([schedule.job_name for schedule in schedules])
    logger.info(
        f"Checking for new runs for the following schedules: {schedule_names}")

    with RepositoryLocationHandleManager(
            grpc_server_registry) as handle_manager:
        for schedule_state in schedules:
            error_info = None
            try:
                origin = schedule_state.origin.external_repository_origin.repository_location_origin
                repo_location_handle = handle_manager.get_handle(origin)
                repo_location = repo_location_handle.create_location()
                yield from launch_scheduled_runs_for_schedule(
                    instance,
                    logger,
                    schedule_state,
                    repo_location,
                    end_datetime_utc,
                    max_catchup_runs,
                    (debug_crash_flags.get(schedule_state.job_name)
                     if debug_crash_flags else None),
                )
            except Exception:  # pylint: disable=broad-except
                error_info = serializable_error_info_from_exc_info(
                    sys.exc_info())
                logger.error(
                    f"Scheduler caught an error for schedule {schedule_state.job_name} : {error_info.to_string()}"
                )
            yield error_info
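launch_scheduled_runs is itself a generator: for each schedule it yields whatever launch_scheduled_runs_for_schedule yields, followed by an error entry (or None), so nothing happens until a caller drains it. A minimal sketch of such a caller; the "SchedulerDaemon" logger name here is an assumption.

def _run_scheduler_iteration(instance, grpc_server_registry, end_datetime_utc):
    # Sketch only: drain the generator so schedules are actually evaluated,
    # keeping any non-None results (including error info) for inspection.
    logger = get_default_daemon_logger("SchedulerDaemon")
    return [
        result
        for result in launch_scheduled_runs(
            instance, grpc_server_registry, logger, end_datetime_utc
        )
        if result is not None
    ]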
Example #4
def _test_launch_sensor_runs_in_subprocess(instance_ref, execution_datetime, debug_crash_flags):
    with DagsterInstance.from_ref(instance_ref) as instance:
        try:
            with pendulum.test(execution_datetime), ProcessGrpcServerRegistry(
                wait_for_processes_on_exit=True
            ) as grpc_server_registry:
                with RepositoryLocationHandleManager(grpc_server_registry) as handle_manager:
                    list(
                        execute_sensor_iteration(
                            instance,
                            get_default_daemon_logger("SensorDaemon"),
                            handle_manager,
                            debug_crash_flags=debug_crash_flags,
                        )
                    )
        finally:
            cleanup_test_instance(instance)
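pendulum.test(execution_datetime) pins pendulum's notion of "now" to the given datetime for the duration of the block, which is what lets the subprocess evaluate sensors as if at a chosen moment. A small illustration, assuming pendulum 2.x where pendulum.test is available:

import pendulum

frozen = pendulum.datetime(2021, 1, 1, tz="UTC")
with pendulum.test(frozen):
    # Inside the block, "now" is pinned to the frozen datetime.
    assert pendulum.now("UTC") == frozen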
Example #5
def execute_sensor_iteration_loop(
    instance, grpc_server_registry, logger, daemon_shutdown_event, until=None
):
    """
    Helper function that performs sensor evaluations on a tighter loop, while reusing grpc locations
    within a given daemon interval.  Rather than relying on the daemon machinery to run the
    iteration loop every 30 seconds, sensors are continuously evaluated, every 5 seconds. We rely on
    each sensor definition's min_interval to check that sensor evaluations are spaced appropriately.
    """
    from dagster.daemon.daemon import CompletedIteration

    handle_manager = None
    manager_loaded_time = None

    RELOAD_LOCATION_MANAGER_INTERVAL = 60

    start_time = pendulum.now("UTC").timestamp()
    with ExitStack() as stack:
        while not daemon_shutdown_event or not daemon_shutdown_event.is_set():
            start_time = pendulum.now("UTC").timestamp()
            if until and start_time >= until:
                # provide a way of organically ending the loop to support test environment
                break

            if (
                not handle_manager
                or (start_time - manager_loaded_time) > RELOAD_LOCATION_MANAGER_INTERVAL
            ):
                stack.close()  # remove the previous context
                handle_manager = stack.enter_context(
                    RepositoryLocationHandleManager(grpc_server_registry)
                )
                manager_loaded_time = start_time

            yield from execute_sensor_iteration(instance, logger, handle_manager)
            loop_duration = pendulum.now("UTC").timestamp() - start_time
            sleep_time = max(0, MIN_INTERVAL_LOOP_TIME - loop_duration)
            yield CompletedIteration()
            time.sleep(sleep_time)
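The until and daemon_shutdown_event parameters give two ways to stop the loop: a wall-clock deadline (useful in tests) and an external shutdown signal. A hedged sketch of driving the loop for a bounded interval; the 30-second deadline and the logger name are arbitrary choices.

import threading

import pendulum


def _run_sensor_loop_briefly(instance, grpc_server_registry):
    # Sketch only: stop via the deadline; the event could also be set from another thread.
    shutdown_event = threading.Event()
    deadline = pendulum.now("UTC").timestamp() + 30
    for _event in execute_sensor_iteration_loop(
        instance,
        grpc_server_registry,
        get_default_daemon_logger("SensorDaemon"),
        shutdown_event,
        until=deadline,
    ):
        # Yielded values include CompletedIteration markers and any per-sensor error info.
        pass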
Example #6
    def run_iteration(self, instance, grpc_server_registry):
        in_progress_runs = self._get_in_progress_runs(instance)
        max_runs_to_launch = self._max_concurrent_runs - len(in_progress_runs)

        # Possibly under 0 if runs were launched without queuing
        if max_runs_to_launch <= 0:
            self._logger.info(
                "{} runs are currently in progress. Maximum is {}, won't launch more."
                .format(len(in_progress_runs), self._max_concurrent_runs))
            return

        queued_runs = self._get_queued_runs(instance)

        if not queued_runs:
            self._logger.info("Poll returned no queued runs.")
        else:
            self._logger.info(
                "Retrieved {} queued runs, checking limits.".format(
                    len(queued_runs)))

        # place in order
        sorted_runs = self._priority_sort(queued_runs)

        # launch until blocked by limit rules
        num_dequeued_runs = 0
        tag_concurrency_limits_counter = _TagConcurrencyLimitsCounter(
            self._tag_concurrency_limits, in_progress_runs)

        with RepositoryLocationHandleManager(
                grpc_server_registry) as location_manager:
            for run in sorted_runs:
                if num_dequeued_runs >= max_runs_to_launch:
                    break

                if tag_concurrency_limits_counter.is_run_blocked(run):
                    continue

                error_info = None

                try:
                    self._dequeue_run(instance, run, location_manager)
                except Exception:  # pylint: disable=broad-except
                    error_info = serializable_error_info_from_exc_info(
                        sys.exc_info())

                    message = (
                        f"Caught an error for run {run.run_id} while removing it from the queue."
                        " Marking the run as failed and dropping it from the queue"
                    )
                    message_with_full_error = f"{message}: {error_info.to_string()}"

                    self._logger.error(message_with_full_error)
                    instance.report_run_failed(run, message_with_full_error)

                    # modify the original error, so that the extra message appears in heartbeats
                    error_info = error_info._replace(
                        message=f"{message}: {error_info.message}")

                else:
                    tag_concurrency_limits_counter.update_counters_with_launched_run(
                        run)
                    num_dequeued_runs += 1

                yield error_info

        self._logger.info("Launched {} runs.".format(num_dequeued_runs))
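The "launch until blocked by limit rules" step relies on a counter that tracks, per tag key/value pair, how many in-progress runs already carry that tag. A simplified, hypothetical version of that bookkeeping (not the actual _TagConcurrencyLimitsCounter) might look like the following, assuming limits are given as (key, value, limit) tuples and runs expose a .tags dict.

from collections import defaultdict


class SimpleTagLimitsCounter:
    """Hypothetical illustration of tag-based run concurrency limits."""

    def __init__(self, tag_limits, in_progress_runs):
        # tag_limits: iterable of (tag_key, tag_value, limit) tuples (an assumption).
        self._limits = {(key, value): limit for key, value, limit in tag_limits}
        self._counts = defaultdict(int)
        for run in in_progress_runs:
            self._count_run(run)

    def _count_run(self, run):
        for key, value in run.tags.items():
            if (key, value) in self._limits:
                self._counts[(key, value)] += 1

    def is_run_blocked(self, run):
        # Blocked if launching the run would exceed any applicable limit.
        return any(
            self._counts[(key, value)] >= self._limits[(key, value)]
            for key, value in run.tags.items()
            if (key, value) in self._limits
        )

    def update_counters_with_launched_run(self, run):
        # Launched runs count against the limits for subsequent candidates.
        self._count_run(run)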