def _create_sensor_tick(instance):
    """Run a single sensor iteration for ``instance`` and discard its results.

    A ``ProcessGrpcServerRegistry`` (created with
    ``wait_for_processes_on_exit=True``) and a
    ``RepositoryLocationHandleManager`` built on top of it are held open as
    context managers while the iteration generator is drained.
    """
    registry_context = ProcessGrpcServerRegistry(wait_for_processes_on_exit=True)
    with registry_context as registry:
        with RepositoryLocationHandleManager(registry) as manager:
            iteration = execute_sensor_iteration(
                instance, get_default_daemon_logger("SensorDaemon"), manager
            )
            # The generator does its work lazily, so exhaust it completely.
            for _ in iteration:
                pass
def evaluate_sensors(instance, grpc_server_registry):
    """Run a single sensor iteration using an existing gRPC server registry.

    Args:
        instance: Passed through to ``execute_sensor_iteration``.
        grpc_server_registry: Registry used to build the
            ``RepositoryLocationHandleManager`` for this iteration.

    The iteration generator is drained fully; nothing is returned.
    """
    with RepositoryLocationHandleManager(grpc_server_registry) as manager:
        sensor_logger = get_default_daemon_logger("SensorDaemon")
        # The generator does its work lazily, so exhaust it completely.
        for _ in execute_sensor_iteration(instance, sensor_logger, manager):
            pass
def launch_scheduled_runs(
    instance,
    grpc_server_registry,
    logger,
    end_datetime_utc,
    max_catchup_runs=DEFAULT_MAX_CATCHUP_RUNS,
    debug_crash_flags=None,
):
    """Generator that launches runs for every stored schedule whose status is RUNNING.

    Args:
        instance: Source of stored schedule state (``all_stored_job_state``);
            also forwarded to ``launch_scheduled_runs_for_schedule``.
        grpc_server_registry: Registry used to build the
            ``RepositoryLocationHandleManager`` shared by all schedules.
        logger: Receives progress and error messages.
        end_datetime_utc: Forwarded to ``launch_scheduled_runs_for_schedule``.
        max_catchup_runs: Forwarded to ``launch_scheduled_runs_for_schedule``.
        debug_crash_flags: Optional mapping looked up by schedule job name; the
            matching entry (or ``None``) is forwarded per schedule.

    Yields:
        Everything ``launch_scheduled_runs_for_schedule`` yields for a schedule,
        then one trailing item per schedule: ``None`` when it completed without
        raising, otherwise the serialized error info for the exception.
    """
    running_schedules = []
    for stored_state in instance.all_stored_job_state(job_type=JobType.SCHEDULE):
        if stored_state.status == JobStatus.RUNNING:
            running_schedules.append(stored_state)

    if not running_schedules:
        logger.info("Not checking for any runs since no schedules have been started.")
        return

    schedule_names = ", ".join(state.job_name for state in running_schedules)
    logger.info(f"Checking for new runs for the following schedules: {schedule_names}")

    with RepositoryLocationHandleManager(grpc_server_registry) as handle_manager:
        for schedule_state in running_schedules:
            try:
                repository_origin = schedule_state.origin.external_repository_origin
                location_origin = repository_origin.repository_location_origin
                repo_location = handle_manager.get_handle(location_origin).create_location()

                if debug_crash_flags:
                    schedule_crash_flags = debug_crash_flags.get(schedule_state.job_name)
                else:
                    schedule_crash_flags = None

                yield from launch_scheduled_runs_for_schedule(
                    instance,
                    logger,
                    schedule_state,
                    repo_location,
                    end_datetime_utc,
                    max_catchup_runs,
                    schedule_crash_flags,
                )
            except Exception:  # pylint: disable=broad-except
                # One failing schedule must not prevent the others from running.
                error_info = serializable_error_info_from_exc_info(sys.exc_info())
                logger.error(
                    f"Scheduler caught an error for schedule {schedule_state.job_name} : {error_info.to_string()}"
                )
            else:
                error_info = None

            # Report the per-schedule outcome to the consumer of this generator.
            yield error_info
def _test_launch_sensor_runs_in_subprocess(instance_ref, execution_datetime, debug_crash_flags):
    """Run one sensor iteration against an instance rebuilt from ``instance_ref``.

    Args:
        instance_ref: Reference handed to ``DagsterInstance.from_ref``.
        execution_datetime: Passed to ``pendulum.test`` for the duration of the
            iteration.
        debug_crash_flags: Forwarded to ``execute_sensor_iteration``.

    ``cleanup_test_instance`` is always called on the instance, even when the
    iteration raises.
    """
    with DagsterInstance.from_ref(instance_ref) as instance:
        try:
            with pendulum.test(execution_datetime):
                registry_context = ProcessGrpcServerRegistry(wait_for_processes_on_exit=True)
                with registry_context as registry:
                    with RepositoryLocationHandleManager(registry) as manager:
                        iteration = execute_sensor_iteration(
                            instance,
                            get_default_daemon_logger("SensorDaemon"),
                            manager,
                            debug_crash_flags=debug_crash_flags,
                        )
                        # The generator does its work lazily, so exhaust it completely.
                        for _ in iteration:
                            pass
        finally:
            cleanup_test_instance(instance)
def execute_sensor_iteration_loop(
    instance, grpc_server_registry, logger, daemon_shutdown_event, until=None
):
    """
    Helper function that performs sensor evaluations on a tighter loop, while reusing grpc
    locations within a given daemon interval.  Rather than relying on the daemon machinery to
    run the iteration loop every 30 seconds, sensors are continuously evaluated, with each pass
    padded out to at least ``MIN_INTERVAL_LOOP_TIME`` seconds.  We rely on each sensor
    definition's min_interval to check that sensor evaluations are spaced appropriately.

    Args:
        instance: Forwarded to ``execute_sensor_iteration``.
        grpc_server_registry: Registry used to build each
            ``RepositoryLocationHandleManager``.
        logger: Forwarded to ``execute_sensor_iteration``.
        daemon_shutdown_event: Object exposing ``is_set()``; the loop stops once it is set.
            A falsy value (e.g. ``None``) means the loop is never stopped this way.
        until: Optional UTC timestamp; the loop ends once a pass would start at or after it.

    Yields:
        Everything ``execute_sensor_iteration`` yields, followed by a ``CompletedIteration``
        marker at the end of each pass.
    """
    from dagster.daemon.daemon import CompletedIteration

    # Seconds a location handle manager is reused before being rebuilt.
    RELOAD_LOCATION_MANAGER_INTERVAL = 60

    handle_manager = None
    manager_loaded_time = None

    with ExitStack() as stack:
        while not daemon_shutdown_event or not daemon_shutdown_event.is_set():
            start_time = pendulum.now("UTC").timestamp()
            if until and start_time >= until:
                # provide a way of organically ending the loop to support test environment
                break

            # Compare against None explicitly: only "not loaded yet" should trigger the
            # initial load, independent of the manager object's truthiness.
            if (
                handle_manager is None
                or (start_time - manager_loaded_time) > RELOAD_LOCATION_MANAGER_INTERVAL
            ):
                stack.close()  # remove the previous context
                handle_manager = stack.enter_context(
                    RepositoryLocationHandleManager(grpc_server_registry)
                )
                manager_loaded_time = start_time

            yield from execute_sensor_iteration(instance, logger, handle_manager)

            loop_duration = pendulum.now("UTC").timestamp() - start_time
            sleep_time = max(0, MIN_INTERVAL_LOOP_TIME - loop_duration)
            # Signal completion before sleeping so the consumer is not held up by the pause.
            yield CompletedIteration()
            time.sleep(sleep_time)
def run_iteration(self, instance, grpc_server_registry):
    """Generator that dequeues queued runs until the concurrency budget is spent.

    Args:
        instance: Used to look up in-progress and queued runs and to report
            runs that fail while being dequeued.
        grpc_server_registry: Registry used to build the
            ``RepositoryLocationHandleManager`` shared by all dequeues.

    Yields:
        One item per dequeue attempt: ``None`` when the run was dequeued, or
        the serialized error info (with an explanatory message prefix) when
        dequeuing raised. Nothing is yielded when no capacity is available.
    """
    active_runs = self._get_in_progress_runs(instance)
    launch_budget = self._max_concurrent_runs - len(active_runs)

    # Possibly under 0 if runs were launched without queuing
    if launch_budget <= 0:
        self._logger.info(
            "{} runs are currently in progress. Maximum is {}, won't launch more.".format(
                len(active_runs), self._max_concurrent_runs
            )
        )
        return

    queued_runs = self._get_queued_runs(instance)
    if queued_runs:
        self._logger.info("Retrieved {} queued runs, checking limits.".format(len(queued_runs)))
    else:
        self._logger.info("Poll returned no queued runs.")

    # place in order
    prioritized_runs = self._priority_sort(queued_runs)

    # launch until blocked by limit rules
    limits_counter = _TagConcurrencyLimitsCounter(self._tag_concurrency_limits, active_runs)
    launched_count = 0

    with RepositoryLocationHandleManager(grpc_server_registry) as location_manager:
        for run in prioritized_runs:
            if launched_count >= launch_budget:
                break

            if limits_counter.is_run_blocked(run):
                continue

            try:
                self._dequeue_run(instance, run, location_manager)
            except Exception:  # pylint: disable=broad-except
                raw_error = serializable_error_info_from_exc_info(sys.exc_info())

                message = (
                    f"Caught an error for run {run.run_id} while removing it from the queue."
                    " Marking the run as failed and dropping it from the queue"
                )
                message_with_full_error = f"{message}: {raw_error.to_string()}"
                self._logger.error(message_with_full_error)
                instance.report_run_failed(run, message_with_full_error)

                # modify the original error, so that the extra message appears in heartbeats
                error_info = raw_error._replace(message=f"{message}: {raw_error.message}")
            else:
                limits_counter.update_counters_with_launched_run(run)
                launched_count += 1
                error_info = None

            yield error_info

    self._logger.info("Launched {} runs.".format(launched_count))