def test_dag_run_event_manager(self):
    """The manager toggles START/STOP actions for repeated events on one dag run."""
    box = Mailbox()
    manager = DagRunEventManager(box)
    manager.start()
    self.create_task_state(dag_run=self._dag_run, task_id='operator_toggle_handler')
    run_id = DagRunId(self._dag_run.dag_id, self._dag_run.run_id)
    event = BaseEvent("test_event", "test_event", namespace="default")

    def expect(action):
        # Pull the next handling result off the mailbox and compare it.
        result = EventHandleResult.from_event(box.get_message())
        assert result == EventHandleResult(run_id, "operator_toggle_handler", action)

    manager.handle_event(run_id, event)
    manager.handle_event(run_id, event)
    expect(SchedulingAction.START)
    expect(SchedulingAction.STOP)
    # Let the per-run executor runner go idle before sending another event.
    time.sleep(2)
    manager.handle_event(run_id, event)
    expect(SchedulingAction.START)
    manager.end()
def test_task_event_executor_runner(self):
    """A runner drains queued events and reports alternating scheduling actions."""
    event = BaseEvent("test_event", "test_event", namespace="default")
    self.create_task_state(dag_run=self._dag_run, task_id='operator_toggle_handler')
    box = Mailbox()
    run_id = DagRunId(self._dag_run.dag_id, self._dag_run.run_id)
    runner = DagRunEventExecutorRunner(box, run_id, 10)
    for _ in range(2):
        runner.put_event(event)
    runner.run()
    # First event toggles the handler on, the second toggles it off again.
    for expected_action in (SchedulingAction.START, SchedulingAction.STOP):
        inner = SchedulerInnerEventUtil.to_inner_event(box.get_message())
        assert EventHandleResult.from_event(inner) == EventHandleResult(
            run_id, "operator_toggle_handler", expected_action)
def test_proccess_dag_file(self):
    """Parsing a single DAG file delivers exactly one message to the mailbox."""
    box = Mailbox()
    dag_path = os.path.join(TEST_DAG_FOLDER, 'test_event_scheduler_dags.py')
    trigger = DagTrigger(subdir=dag_path,
                         mailbox=box,
                         using_sqlite=True,
                         num_runs=-1)
    trigger.start()
    # Give the file processor time to parse and publish.
    time.sleep(5)
    self.assertEqual(1, box.length())
    trigger.stop()
def test_dag_trigger(self):
    """A DAG that newly needs a dagrun is announced through the mailbox."""
    box = Mailbox()
    trigger = DagTrigger(".", -1, [], False, box)
    trigger.start()
    type(self)._add_dag_needing_dagrun()
    inner = SchedulerInnerEventUtil.to_inner_event(box.get_message())
    assert inner.dag_id == "test"
    trigger.end()
def test_dag_run_event_manager_resubmit_if_exit_with_nonempty_queue(self):
    """With max_num_event=1 the runner exits after one event; the manager must
    resubmit the runner so the second queued event is still handled."""
    mailbox = Mailbox()
    event_manager = DagRunEventManager(mailbox, max_num_event=1)
    event_manager.start()
    self.create_task_state(dag_run=self._dag_run, task_id='operator_toggle_handler')
    event = BaseEvent("test_event", "test_event", namespace="default")
    event_manager.handle_event(
        DagRunId(self._dag_run.dag_id, self._dag_run.run_id), event)
    event_manager.handle_event(
        DagRunId(self._dag_run.dag_id, self._dag_run.run_id), event)
    # Both events must produce a result, even though the runner was restarted.
    assert mailbox.get_message() is not None
    assert mailbox.get_message_with_timeout(5) is not None
    # BUGFIX: the manager was previously leaked; shut its worker threads down
    # like every other test in this suite does.
    event_manager.end()
def test_dag_trigger_parse_dag(self):
    """Parsing stores both DAG models but only reports the runnable one."""
    box = Mailbox()
    trigger = DagTrigger("../../dags/test_scheduler_dags.py", -1, [], False, box)
    trigger.start()
    inner = SchedulerInnerEventUtil.to_inner_event(box.get_message())
    # only one dag is executable
    assert "test_task_start_date_scheduling" == inner.dag_id
    # Both dags must be persisted as DagModel and SerializedDagModel rows.
    for dag_id in ("test_task_start_date_scheduling", "test_start_date_scheduling"):
        assert DagModel.get_dagmodel(dag_id=dag_id) is not None
        assert SerializedDagModel.get(dag_id=dag_id) is not None
    trigger.end()
def test_dag_run_event_manager_release_runner(self):
    """An idle runner is released; handling a second dag run creates a new one."""
    dag_run1 = self._dag_run
    _, dag_run2 = self.init_dag_and_dag_run(
        '../../dags/test_task_event_handler_dag.py', 'test_event_handler',
        timezone.datetime(2017, 1, 2))
    self.create_task_state(dag_run1, 'operator_toggle_handler')
    self.create_task_state(dag_run2, 'operator_toggle_handler')
    event = BaseEvent("test_event", "test_event", namespace="default")
    mailbox = Mailbox()
    event_manager = DagRunEventManager(mailbox=mailbox)
    event_manager.handle_event(DagRunId(dag_run1.dag_id, dag_run1.run_id), event)
    # Wait long enough for dag_run1's idle runner to be released.
    time.sleep(5)
    # BUGFIX: this previously mixed dag_run1.dag_id with dag_run2.run_id; it
    # only worked because both runs come from the same DAG. Use dag_run2's own
    # identity, which is what the assertion below checks for.
    event_manager.handle_event(DagRunId(dag_run2.dag_id, dag_run2.run_id), event)
    assert (DagRunId(dag_run2.dag_id,
                     dag_run2.run_id)) in event_manager._event_executor_runners
    assert (DagRunId(dag_run1.dag_id,
                     dag_run1.run_id)) not in event_manager._event_executor_runners
    event_manager.end()
def __init__(self, dag_directory, server_uri=None, max_runs=-1,
             refresh_dag_dir_interval=conf.getint(
                 'scheduler', 'refresh_dag_dir_interval', fallback=30),
             *args, **kwargs):
    """Wire together the event-based scheduling components.

    :param dag_directory: directory the DagTrigger scans for DAG files
    :param server_uri: notification service URI; forwarded to both the
        DagTrigger and the NotificationClient (may be None)
    :param max_runs: forwarded to DagTrigger as max_runs (-1 means unlimited
        — presumably; TODO confirm against DagTrigger)
    :param refresh_dag_dir_interval: seconds between DAG-dir refreshes,
        read from [scheduler] refresh_dag_dir_interval (default 30)
    """
    super().__init__(*args, **kwargs)
    # Single mailbox shared by trigger, event manager, executor and scheduler.
    self.mailbox: Mailbox = Mailbox()
    self.dag_trigger: DagTrigger = DagTrigger(
        dag_directory=dag_directory,
        max_runs=max_runs,
        dag_ids=None,
        pickle_dags=False,
        mailbox=self.mailbox,
        refresh_dag_dir_interval=refresh_dag_dir_interval,
        notification_service_uri=server_uri)
    self.task_event_manager = DagRunEventManager(self.mailbox)
    # Executor reports back through the same mailbox.
    self.executor.set_mailbox(self.mailbox)
    self.notification_client: NotificationClient = NotificationClient(
        server_uri=server_uri, default_namespace=SCHEDULER_NAMESPACE)
    self.scheduler: EventBasedScheduler = EventBasedScheduler(
        self.id, self.mailbox, self.task_event_manager, self.executor,
        self.notification_client)
    # Remembered so events of the previous scheduler job can be replayed.
    self.last_scheduling_id = self._last_scheduler_job_id()
def test_user_trigger_parse_dag(self):
    """A client can request a DAG parse while the trigger listens on the NS."""
    port = 50101
    service_uri = 'localhost:{}'.format(port)
    master = NotificationMaster(NotificationService(MemoryEventStorage()), port)
    master.run()
    box = Mailbox()
    trigger = DagTrigger("../../dags/test_scheduler_dags.py", -1, [], False,
                         box, 5, service_uri)
    trigger.start()
    inner = SchedulerInnerEventUtil.to_inner_event(box.get_message())
    # only one dag is executable
    assert "test_task_start_date_scheduling" == inner.dag_id
    client = EventSchedulerClient(server_uri=service_uri, namespace='a')
    client.trigger_parse_dag()
    trigger.end()
    master.stop()
def test_dag_trigger_is_alive(self):
    """is_alive flips to True after start() and back to False after end()."""
    trigger = DagTrigger(".", -1, [], False, Mailbox())
    assert not trigger.is_alive()
    trigger.start()
    time.sleep(1)
    assert trigger.is_alive()
    trigger.end()
    assert not trigger.is_alive()
def test_dag_run_event_manager_multiple_dag_runs(self):
    """One manager toggles actions independently for two concurrent dag runs."""
    run_a = self._dag_run
    _, run_b = self.init_dag_and_dag_run(
        '../../dags/test_task_event_handler_dag.py', 'test_event_handler',
        timezone.datetime(2017, 1, 2))
    for run in (run_a, run_b):
        self.create_task_state(run, 'operator_toggle_handler')
    event = BaseEvent("test_event", "test_event", namespace="default")
    box = Mailbox()
    manager = DagRunEventManager(mailbox=box)
    id_a = DagRunId(run_a.dag_id, run_a.run_id)
    id_b = DagRunId(run_b.dag_id, run_b.run_id)
    # First round toggles both handlers on, second round toggles them off.
    for expected in (SchedulingAction.START, SchedulingAction.STOP):
        manager.handle_event(id_a, event)
        manager.handle_event(id_b, event)
        # Results may arrive in either order, so collect both before checking.
        received = [EventHandleResult.from_event(box.get_message()),
                    EventHandleResult.from_event(box.get_message())]
        assert EventHandleResult(id_a, "operator_toggle_handler", expected) in received
        assert EventHandleResult(id_b, "operator_toggle_handler", expected) in received
    manager.end()
def test_add_task(self):
    """Periodic tasks fire for 7-field cron, 6-field cron and interval configs."""
    box = Mailbox()
    manager = PeriodicManager(box)
    manager.start()
    manager.add_task('1', '1', {'cron': '*/1 * * * * * *'})
    fired = box.get_message()
    manager.remove_task('1', '1')
    self.assertEqual('1', fired.key)
    manager.add_task('2', '2', {'cron': '*/1 * * * * *'})
    fired = box.get_message()
    self.assertEqual('2', fired.key)
    manager.remove_task('2', '2')
    manager.add_task('3', '3', {'interval': '0,0,0,0,1'})
    fired = box.get_message()
    self.assertEqual('3', fired.key)
    manager.remove_task('3', '3')
    manager.shutdown()
def test_task_event_executor_runner_max_event(self):
    """A runner caps handled events at max_num_event; the rest stay queued."""
    event = BaseEvent("test_event", "test_event", namespace="default")
    self.create_task_state(dag_run=self._dag_run, task_id='operator_toggle_handler')
    box = Mailbox()
    runner = DagRunEventExecutorRunner(
        box, DagRunId(self._dag_run.dag_id, self._dag_run.run_id), 5)
    for _ in range(10):
        runner.put_event(event)
    runner.run()
    # Only 5 of the 10 events were handled ...
    drained = [box.get_message() for _ in range(5)]
    # ... the other 5 remain in the runner's queue.
    assert runner._event_queue.qsize() == 5
def test_file_processor_manager_kill(self):
    """A killed processor-manager process gets respawned by the agent."""
    trigger = DagTrigger(".", -1, [], False, Mailbox())
    trigger.start()
    manager_process = trigger._dag_file_processor_agent._process
    manager_process.kill()
    manager_process.join(1)
    assert not manager_process.is_alive()
    # Give the agent time to notice the death and start a replacement.
    time.sleep(5)
    manager_process = trigger._dag_file_processor_agent._process
    assert manager_process.is_alive()
    trigger.end()
def __init__(self,
             dag_id=None,
             dag_ids=None,
             subdir=settings.DAGS_FOLDER,
             num_runs=conf.getint('scheduler', 'num_runs', fallback=-1),
             processor_poll_interval=conf.getfloat(
                 'scheduler', 'processor_poll_interval', fallback=1),
             use_local_nf=conf.getboolean('scheduler',
                                          'use_local_notification',
                                          fallback=True),
             nf_host=conf.get('scheduler', 'notification_host',
                              fallback='localhost'),
             nf_port=conf.getint('scheduler', 'notification_port',
                                 fallback=50051),
             unit_test_mode=conf.getboolean('core', 'unit_test_mode',
                                            fallback=False),
             executor_heartbeat_interval=conf.getint(
                 'scheduler', 'executor_heartbeat_interval', fallback=2),
             run_duration=None,
             do_pickle=False,
             log=None,
             *args, **kwargs):
    """Initialize the event-driven scheduler job.

    :param use_local_nf: run an embedded notification service instead of
        connecting to a remote one
    :param nf_host: notification service host (only used when
        use_local_nf is False)
    :param nf_port: notification service port
    :param unit_test_mode: when True, a STOP_SCHEDULER_CMD event stops the
        event loop (used by tests)
    :param executor_heartbeat_interval: seconds between executor heartbeats
        sent by the background heartbeat thread
    Remaining parameters are forwarded to the parent scheduler job.
    """
    super().__init__(dag_id, dag_ids, subdir, num_runs,
                     processor_poll_interval, run_duration, do_pickle, log,
                     *args, **kwargs)
    self.dag_trigger = None
    self.notification_master = None
    self.use_local_nf = use_local_nf
    self.nf_host = nf_host
    self.nf_port = nf_port
    # Central queue through which all scheduler events flow.
    self.mail_box = Mailbox()
    # Flag polled by the event loop and the heartbeat thread.
    self.running = True
    # Maps running dag runs to their simple dags / event subscriptions.
    self.dagrun_route = DagRunRoute()
    self.unit_test_mode = unit_test_mode
    self.executor_heartbeat_interval = executor_heartbeat_interval
    self.heartbeat_thread = None
def test_replay_message(self):
    """Messages are persisted per scheduling-job id and can be replayed."""
    key = "stop"
    box = Mailbox()
    box.set_scheduling_job_id(1234)
    self.client.start_listen_events(watcher=SchedulerEventWatcher(box),
                                    start_time=int(time.time() * 1000),
                                    version=None)
    self.send_event(key)
    received: BaseEvent = box.get_message()
    self.assertEqual(received.key, key)
    with create_session() as session:
        stored = session.query(Message).first()
        # A different job id must see no unprocessed messages.
        self.assertEqual(
            0, len(EventBasedScheduler.get_unprocessed_message(1000)))
        unprocessed = EventBasedScheduler.get_unprocessed_message(1234)
        self.assertEqual(unprocessed[0].serialized_message, stored.data)
        # The persisted payload round-trips to the original event.
        restored = pickle.loads(stored.data)
        self.assertEqual(restored.key, key)
        self.assertEqual(received, restored)
def test_add_task_invalidated(self):
    """Malformed cron/interval expressions are rejected with an Exception."""
    manager = PeriodicManager(Mailbox())
    manager.start()
    with self.assertRaises(Exception) as raised:
        manager.add_task('1', '1', {'cron': '*/1 * * * *'})
    self.assertTrue('The cron expression' in str(raised.exception))
    with self.assertRaises(Exception) as raised:
        manager.add_task('2', '2', {'interval': '0,0,0,1'})
    self.assertTrue('The interval expression' in str(raised.exception))
    manager.shutdown()
def test_trigger_parse_dag(self):
    """Each requested DAG file can be parsed on demand via the NS server."""
    import os
    port = 50102
    server_uri = "localhost:{}".format(port)
    storage = MemoryEventStorage()
    master = NotificationMaster(NotificationService(storage), port)
    master.run()
    dag_folder = os.path.abspath(os.path.dirname(__file__)) + "/../../dags"
    mailbox = Mailbox()
    dag_trigger = DagTrigger(dag_folder, -1, [], False, mailbox,
                             notification_service_uri=server_uri)
    dag_trigger.start()
    to_be_triggered = [dag_folder + "/test_event_based_scheduler.py",
                       dag_folder + "/test_event_task_dag.py",
                       dag_folder + "/test_event_based_executor.py",
                       dag_folder + "/test_scheduler_dags.py",
                       ]
    for file in to_be_triggered:
        self._send_request_and_receive_response(server_uri, file)
    dag_trigger.end()
    # BUGFIX: the notification master was leaked, leaving port 50102 bound for
    # the rest of the test session (sibling tests stop their master).
    master.stop()
class EventSchedulerJob(SchedulerJob):
    """
    EventSchedulerJob: The scheduler driven by events. The scheduler get the message from notification
    service, then scheduling the tasks which affected by the events.
    """
    __mapper_args__ = {'polymorphic_identity': 'EventSchedulerJob'}

    def __init__(self,
                 dag_id=None,
                 dag_ids=None,
                 subdir=settings.DAGS_FOLDER,
                 num_runs=conf.getint('scheduler', 'num_runs', fallback=-1),
                 processor_poll_interval=conf.getfloat(
                     'scheduler', 'processor_poll_interval', fallback=1),
                 use_local_nf=conf.getboolean('scheduler',
                                              'use_local_notification',
                                              fallback=True),
                 nf_host=conf.get('scheduler', 'notification_host',
                                  fallback='localhost'),
                 nf_port=conf.getint('scheduler', 'notification_port',
                                     fallback=50051),
                 unit_test_mode=conf.getboolean('core', 'unit_test_mode',
                                                fallback=False),
                 executor_heartbeat_interval=conf.getint(
                     'scheduler', 'executor_heartbeat_interval', fallback=2),
                 run_duration=None,
                 do_pickle=False,
                 log=None,
                 *args, **kwargs):
        """Initialize the event-driven scheduler job.

        :param use_local_nf: run an embedded notification service instead of
            connecting to a remote one
        :param nf_host: notification service host (used when use_local_nf is False)
        :param nf_port: notification service port
        :param unit_test_mode: when True a STOP_SCHEDULER_CMD event stops the loop
        :param executor_heartbeat_interval: seconds between executor heartbeats
        Remaining parameters are forwarded to the parent SchedulerJob.
        """
        super().__init__(dag_id, dag_ids, subdir, num_runs,
                         processor_poll_interval, run_duration, do_pickle, log,
                         *args, **kwargs)
        self.dag_trigger = None
        self.notification_master = None
        self.use_local_nf = use_local_nf
        self.nf_host = nf_host
        self.nf_port = nf_port
        # Central queue through which all scheduler events flow.
        self.mail_box = Mailbox()
        # Flag polled by the event loop and the heartbeat thread.
        self.running = True
        # Maps running dag runs to their simple dags / event subscriptions.
        self.dagrun_route = DagRunRoute()
        self.unit_test_mode = unit_test_mode
        self.executor_heartbeat_interval = executor_heartbeat_interval
        self.heartbeat_thread = None

    @provide_session
    def _get_dag_runs(self, event, session):
        """Translate an incoming event into the list of dag runs it affects.

        Scheduler-internal events (DAG_RUN_EXECUTABLE, TASK_STATUS_CHANGED, ...)
        update the route/task state; any other event is dispatched to the task
        event handlers of the dag runs subscribed to its (key, type).

        :param event: the incoming event
        :param session: DB session (injected by provide_session)
        :return: dag runs whose task instances should be (re)processed
        """
        dag_runs = []
        if EventType.is_in(event.event_type) and EventType(
                event.event_type) != EventType.UNDEFINED:
            if EventType(event.event_type) == EventType.DAG_RUN_EXECUTABLE:
                # A new dag run became executable: register it in the route.
                dag_run_id = int(event.key)
                dag_run = session.query(DagRun).filter(
                    DagRun.id == dag_run_id).first()
                if dag_run is None:
                    self.log.error("DagRun is None id {0}".format(dag_run_id))
                    return dag_runs
                simple_dag = event.simple_dag
                dag_run.pickle_id = None
                # create route
                self.dagrun_route.add_dagrun(dag_run, simple_dag, session)
                dag_runs.append(dag_run)
            elif EventType(event.event_type) == EventType.TASK_STATUS_CHANGED:
                dag_id, task_id, execution_date = TaskInstanceHelper.from_task_key(
                    event.key)
                state, try_num = TaskInstanceHelper.from_event_value(event.value)
                dag_run = self.dagrun_route.find_dagrun(dag_id, execution_date)
                if dag_run is None:
                    return dag_runs
                self._set_task_instance_state(dag_run, dag_id, task_id,
                                              execution_date, state, try_num)
                sync_dag_run = session.query(DagRun).filter(
                    DagRun.id == dag_run.id).first()
                if sync_dag_run.state in State.finished():
                    self.log.info(
                        "DagRun finished dag_id {0} execution_date {1} state {2}"
                        .format(dag_run.dag_id, dag_run.execution_date,
                                sync_dag_run.state))
                    # Remove the finished run from the route exactly once and
                    # announce its completion through the mailbox.
                    if self.dagrun_route.find_dagrun_by_id(
                            sync_dag_run.id) is not None:
                        self.dagrun_route.remove_dagrun(dag_run, session)
                        self.log.debug("Route remove dag run {0}".format(
                            sync_dag_run.id))
                        self.mail_box.send_message(
                            DagRunFinishedEvent(dag_run.id, sync_dag_run.state))
                else:
                    dag_runs.append(dag_run)
            elif EventType(event.event_type) == EventType.DAG_RUN_FINISHED:
                self.log.debug("DagRun {0} finished".format(event.key))
            elif EventType(event.event_type) == EventType.STOP_SCHEDULER_CMD:
                # Only honored in unit-test mode; stops the event loop.
                if self.unit_test_mode:
                    self.running = False
                return dag_runs
        else:
            # External event: fan it out to every subscribed dag run's tasks.
            runs = self.dagrun_route.find_dagruns_by_event(
                event_key=event.key, event_type=event.event_type)
            if runs is not None:
                for run in runs:
                    task_deps = load_task_dependencies(dag_id=run.dag_id,
                                                       session=session)
                    tis = run.get_task_instances(session=session)
                    for ti in tis:
                        if ti.task_id not in task_deps:
                            continue
                        if (event.key, event.event_type) in task_deps[ti.task_id]:
                            self.log.debug("{0} handle event {1}".format(
                                ti.task_id, event))
                            ts = TaskState.query_task_state(ti, session=session)
                            handler = ts.event_handler
                            if handler is not None:
                                action = handler.handle_event(event, ti=ti,
                                                              ts=ts,
                                                              session=session)
                                ts.action = action
                                session.merge(ts)
                                session.commit()
                                self.log.debug(
                                    "set task action {0} {1}".format(
                                        ti.task_id, action))
                dag_runs.extend(runs)
        session.commit()
        # Process the affected dag runs' tasks in a separate process.
        for dag_run in dag_runs:
            run_process_func(target=process_tasks, args=(
                dag_run,
                self.dagrun_route.find_simple_dag(dag_run.id),
                self.log,
            ))
        return dag_runs

    @provide_session
    def _sync_event_to_db(self, event: Event, session=None):
        """Persist the event so it survives scheduler restarts."""
        EventModel.sync_event(event=event, session=session)

    @provide_session
    def _run_event_loop(self, session=None):
        """
        The main process event loop
        :param session: the connection of db session.
        :return: None
        """
        while self.running:
            event: Event = self.mail_box.get_message()
            self.log.debug('EVENT: {0}'.format(event))
            if not self.use_local_nf:
                # BUGFIX: previously called self._sync_event_to_db(session),
                # passing the session where the event parameter is expected —
                # the event was never persisted.
                self._sync_event_to_db(event)
            try:
                dag_runs = self._get_dag_runs(event)
                if dag_runs is None or len(dag_runs) == 0:
                    continue
                # create SimpleDagBag
                simple_dags = []
                for dag_run in dag_runs:
                    simple_dags.append(
                        self.dagrun_route.find_simple_dag(
                            dagrun_id=dag_run.id))
                simple_dag_bag = SimpleDagBag(simple_dags)
                if not self._validate_and_run_task_instances(
                        simple_dag_bag=simple_dag_bag):
                    continue
            except Exception as e:
                # Keep the loop alive; a single bad event must not kill the
                # scheduler.
                self.log.exception(str(e))
        # scheduler end
        self.log.debug("_run_event_loop end")

    @provide_session
    def _init_route(self, session=None):
        """
        Init the DagRunRoute object from db.

        :param session:
        :return:
        """
        # running_dag_runs = session.query(DagRun).filter(DagRun.state == State.RUNNING).all()
        # for dag_run in running_dag_runs:
        #     dag_model = session.query(DagModel).filter(DagModel.dag_id == dag_run.dag_id).first()
        #     dagbag = models.DagBag(dag_model.fileloc)
        #     dag_run.dag = dagbag.get_dag(dag_run.dag_id)
        #     self.dagrun_route.add_dagrun(dag_run)
        # todo init route
        pass

    def _executor_heartbeat(self):
        """Periodically heartbeat the executor until the job stops running."""
        while self.running:
            self.log.info("executor heartbeat...")
            self.executor.heartbeat()
            time.sleep(self.executor_heartbeat_interval)

    def _start_executor_heartbeat(self):
        """Start the executor heartbeat loop on a daemon thread."""
        self.heartbeat_thread = threading.Thread(
            target=self._executor_heartbeat, args=())
        self.heartbeat_thread.setDaemon(True)
        self.heartbeat_thread.start()

    def _stop_executor_heartbeat(self):
        """Stop the heartbeat loop and wait for its thread to exit."""
        self.running = False
        if self.heartbeat_thread is not None:
            self.heartbeat_thread.join()

    # Backward-compatible alias for the original misspelled method name.
    _stop_executor_heartheat = _stop_executor_heartbeat

    def _execute(self):
        """
        1. Init the DagRun route.
        2. Start the executor.
        3. Option of start the notification master.
        4. Create the notification client.
        5. Start the DagTrigger.
        6. Run the scheduler event loop.

        :return:
        """
        notification_client = None
        try:
            self._init_route()
            self.executor.set_use_nf(True)
            self.executor.start()
            self.dag_trigger = DagTrigger(
                subdir=self.subdir,
                mailbox=self.mail_box,
                run_duration=self.run_duration,
                using_sqlite=self.using_sqlite,
                num_runs=self.num_runs,
                processor_poll_interval=self._processor_poll_interval)
            if self.use_local_nf:
                # Embedded notification service backed by the event DB table.
                self.notification_master \
                    = NotificationMaster(service=NotificationService(EventModelStorage()),
                                         port=self.nf_port)
                self.notification_master.run()
                self.log.info("start notification service {0}".format(
                    self.nf_port))
                notification_client = NotificationClient(
                    server_uri="localhost:{0}".format(self.nf_port))
            else:
                notification_client \
                    = NotificationClient(server_uri="{0}:{1}".format(self.nf_host, self.nf_port))
            notification_client.start_listen_events(
                watcher=SCEventWatcher(self.mail_box))
            self.dag_trigger.start()
            self._start_executor_heartbeat()
            self._run_event_loop()
        except Exception as e:
            self.log.exception("Exception when executing _execute {0}".format(
                str(e)))
        finally:
            # Tear everything down in reverse order; each step is guarded so a
            # failure earlier in start-up does not mask the original exception.
            self.running = False
            self._stop_executor_heartbeat()
            if self.dag_trigger is not None:
                self.dag_trigger.stop()
            if notification_client is not None:
                notification_client.stop_listen_events()
            if self.notification_master is not None:
                self.notification_master.stop()
            self.executor.end()
            self.log.info("Exited execute event scheduler")

    @provide_session
    def _set_task_instance_state(self, dag_run, dag_id, task_id,
                                 execution_date, state, try_number,
                                 session=None):
        """
        Set the task state to db and maybe set the dagrun object finished to db.

        :param dag_run: DagRun object
        :param dag_id: Dag identify
        :param task_id: task identify
        :param execution_date: the dag run execution date
        :param state: the task state should be set.
        :param try_number: the task try_number.
        :param session:
        :return:
        """
        TI = models.TaskInstance
        qry = session.query(TI).filter(TI.dag_id == dag_id,
                                       TI.task_id == task_id,
                                       TI.execution_date == execution_date)
        ti = qry.first()
        if not ti:
            self.log.warning("TaskInstance %s went missing from the database",
                             ti)
            return
        ts = TaskState.query_task_state(ti, session)
        self.log.debug(
            "set task state dag_id {0} task_id {1} execution_date {2} try_number {3} "
            "current try_number {4} state {5} ack_id {6} action {7}.".format(
                dag_id, task_id, execution_date, try_number, ti.try_number,
                state, ts.ack_id, ts.action))
        is_restart = False
        if state == State.FAILED or state == State.SUCCESS or state == State.SHUTDOWN:
            if ti.try_number == try_number and ti.state == State.QUEUED:
                # The executor finished a task the DB still thinks is queued —
                # it was most likely killed externally.
                msg = ("Executor reports task instance {} finished ({}) "
                       "although the task says its {}. Was the task "
                       "killed externally?".format(ti, state, ti.state))
                Stats.incr('scheduler.tasks.killed_externally')
                self.log.error(msg)
                try:
                    # BUGFIX: was self.task_route.find_dagrun(...); task_route
                    # is never defined on this class — the route object is
                    # self.dagrun_route (see __init__ and _get_dag_runs).
                    dag = self.dagrun_route.find_dagrun(dag_id, execution_date)
                    # NOTE(review): find_dagrun appears to return a DagRun;
                    # confirm it exposes get_task (or resolve via dag_run.dag).
                    ti.task = dag.get_task(task_id)
                    ti.handle_failure(msg)
                except Exception:
                    self.log.error(
                        "Cannot load the dag bag to handle failure for %s"
                        ". Setting task to FAILED without callbacks or "
                        "retries. Do you have enough resources?", ti)
                    ti.state = State.FAILED
                    session.merge(ti)
            else:
                if ts.action is None:
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} action is None."
                        .format(dag_id, task_id, execution_date))
                elif TaskAction(ts.action) == TaskAction.RESTART:
                    # if ts.stop_flag is not None and ts.stop_flag == try_number:
                    # Reschedule the task and clear the pending action.
                    ti.state = State.SCHEDULED
                    ts.action = None
                    ts.stop_flag = None
                    ts.ack_id = 0
                    session.merge(ti)
                    session.merge(ts)
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} try_number {3} restart action."
                        .format(dag_id, task_id, execution_date,
                                str(try_number)))
                    is_restart = True
                elif TaskAction(ts.action) == TaskAction.STOP:
                    # if ts.stop_flag is not None and ts.stop_flag == try_number:
                    # Acknowledge the stop and clear the pending action.
                    ts.action = None
                    ts.stop_flag = None
                    ts.ack_id = 0
                    session.merge(ts)
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} try_number {3} stop action."
                        .format(dag_id, task_id, execution_date,
                                str(try_number)))
                else:
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} action {3}."
                        .format(dag_id, task_id, execution_date, ts.action))
        session.commit()
        if not is_restart and ti.state == State.RUNNING:
            self.log.debug(
                "set task dag_id {0} task_id {1} execution_date {2} state {3}".
                format(dag_id, task_id, execution_date, state))
            ti.state = state
            session.merge(ti)
        session.commit()
        # update dagrun state
        sync_dag_run = session.query(DagRun).filter(
            DagRun.id == dag_run.id).first()
        if sync_dag_run.state not in FINISHED_STATES:
            if self.dagrun_route.find_dagrun_by_id(sync_dag_run.id) is None:
                self.log.error(
                    "DagRun lost dag_id {0} task_id {1} execution_date {2}".
                    format(dag_id, task_id, execution_date))
            else:
                run_process_func(target=dag_run_update_state, args=(
                    dag_run,
                    self.dagrun_route.find_simple_dag(dag_run.id),
                ))

    @provide_session
    def _create_task_instances(self, dag_run, session=None):
        """
        This method schedules the tasks for a single DAG by looking at the
        active DAG runs and adding task instances that should run to the
        queue.
        """
        # update the state of the previously active dag runs
        dag_runs = DagRun.find(dag_id=dag_run.dag_id, state=State.RUNNING,
                               session=session)
        active_dag_runs = []
        for run in dag_runs:
            self.log.info("Examining DAG run %s", run)
            # don't consider runs that are executed in the future unless
            # specified by config and schedule_interval is None
            if run.execution_date > timezone.utcnow(
            ) and not dag_run.dag.allow_future_exec_dates:
                self.log.error("Execution date is in future: %s",
                               run.execution_date)
                continue
            if len(active_dag_runs) >= dag_run.dag.max_active_runs:
                self.log.info(
                    "Number of active dag runs reached max_active_run.")
                break
            # skip backfill dagruns for now as long as they are not really scheduled
            if run.is_backfill:
                continue
            run.dag = dag_run.dag
            # todo: preferably the integrity check happens at dag collection time
            run.verify_integrity(session=session)
            run.update_state(session=session)
            if run.state == State.RUNNING:
                make_transient(run)
                active_dag_runs.append(run)

    def _process_dags_and_create_dagruns(self, dagbag, dags, dagrun_out):
        """
        Iterates over the dags and processes them. Processing includes:

        1. Create appropriate DagRun(s) in the DB.
        2. Create appropriate TaskInstance(s) in the DB.
        3. Send emails for tasks that have missed SLAs.

        :param dagbag: a collection of DAGs to process
        :type dagbag: airflow.models.DagBag
        :param dags: the DAGs from the DagBag to process
        :type dags: list[airflow.models.DAG]
        :param dagrun_out: A list to add DagRun objects
        :type dagrun_out: list[DagRun]
        :rtype: None
        """
        for dag in dags:
            dag = dagbag.get_dag(dag.dag_id)
            if not dag:
                self.log.error("DAG ID %s was not found in the DagBag",
                               dag.dag_id)
                continue
            if dag.is_paused:
                self.log.info("Not processing DAG %s since it's paused",
                              dag.dag_id)
                continue
            self.log.info("Processing %s", dag.dag_id)
            dag_run = self.create_dag_run(dag)
            if dag_run:
                dag_run.dag = dag
                expected_start_date = dag.following_schedule(
                    dag_run.execution_date)
                if expected_start_date:
                    schedule_delay = dag_run.start_date - expected_start_date
                    Stats.timing(
                        'dagrun.schedule_delay.{dag_id}'.format(
                            dag_id=dag.dag_id), schedule_delay)
                self.log.info("Created %s", dag_run)
                self._create_task_instances(dag_run)
                self.log.info("Created tasks instances %s", dag_run)
                dagrun_out.append(dag_run)
            if conf.getboolean('core', 'CHECK_SLAS', fallback=True):
                self.manage_slas(dag)

    @provide_session
    def process_file(self, file_path, zombies, pickle_dags=False,
                     session=None):
        """
        Process a Python file containing Airflow DAGs.

        This includes:

        1. Execute the file and look for DAG objects in the namespace.
        2. Pickle the DAG and save it to the DB (if necessary).
        3. For each DAG, see what tasks should run and create appropriate task
        instances in the DB.
        4. Record any errors importing the file into ORM
        5. Kill (in ORM) any task instances belonging to the DAGs that haven't
        issued a heartbeat in a while.

        Returns a list of SimpleDag objects that represent the DAGs found in
        the file

        :param file_path: the path to the Python file that should be executed
        :type file_path: unicode
        :param zombies: zombie task instances to kill.
        :type zombies: list[airflow.utils.dag_processing.SimpleTaskInstance]
        :param pickle_dags: whether serialize the DAGs found in the file and
            save them to the db
        :type pickle_dags: bool
        :return: a list of SimpleDagRuns made from the Dags found in the file
        :rtype: list[airflow.utils.dag_processing.SimpleDagBag]
        """
        self.log.info("Processing file %s for tasks to queue", file_path)
        if session is None:
            session = settings.Session()
        # As DAGs are parsed from this file, they will be converted into SimpleDags
        try:
            dagbag = models.DagBag(file_path, include_examples=False)
        except Exception:
            self.log.exception("Failed at reloading the DAG file %s",
                               file_path)
            Stats.incr('dag_file_refresh_error', 1, 1)
            # BUGFIX: was `return [], []` — every other path returns
            # (simple_dag_runs, import_error_count:int); return 0 errors here
            # so callers can treat the second element uniformly as a count.
            return [], 0
        if len(dagbag.dags) > 0:
            self.log.info("DAG(s) %s retrieved from %s", dagbag.dags.keys(),
                          file_path)
        else:
            self.log.warning("No viable dags retrieved from %s", file_path)
            self.update_import_errors(session, dagbag)
            return [], len(dagbag.import_errors)
        # Save individual DAGs in the ORM and update DagModel.last_scheduled_time
        for dag in dagbag.dags.values():
            dag.sync_to_db()
        paused_dag_ids = [
            dag.dag_id for dag in dagbag.dags.values() if dag.is_paused
        ]
        self.log.info("paused_dag_ids %s", paused_dag_ids)
        self.log.info("self %s", self.dag_ids)
        dag_to_pickle = {}
        # Pickle the DAGs (if necessary) and put them into a SimpleDag
        for dag_id in dagbag.dags:
            # Only return DAGs that are not paused
            if dag_id not in paused_dag_ids:
                dag = dagbag.get_dag(dag_id)
                pickle_id = None
                if pickle_dags:
                    pickle_id = dag.pickle(session).id
                dag_to_pickle[dag.dag_id] = pickle_id
        if len(self.dag_ids) > 0:
            dags = [
                dag for dag in dagbag.dags.values()
                if dag.dag_id in self.dag_ids
                and dag.dag_id not in paused_dag_ids
            ]
        else:
            dags = [
                dag for dag in dagbag.dags.values()
                if not dag.parent_dag and dag.dag_id not in paused_dag_ids
            ]
        # Not using multiprocessing.Queue() since it's no longer a separate
        # process and due to some unusual behavior. (empty() incorrectly
        # returns true as described in https://bugs.python.org/issue23582 )
        self.log.info("dags %s", dags)
        dag_run_out = []
        self._process_dags_and_create_dagruns(dagbag, dags, dag_run_out)
        self.log.info("dag run out %s", len(dag_run_out))
        simple_dag_runs = []
        for dag_run in dag_run_out:
            simple_dag_runs.append(
                SimpleDagRun(dag_run.id, SimpleDag(dag_run.dag)))
        # commit batch
        session.commit()
        # Record import errors into the ORM
        try:
            self.update_import_errors(session, dagbag)
        except Exception:
            self.log.exception("Error logging import errors!")
        try:
            dagbag.kill_zombies(zombies)
        except Exception:
            self.log.exception("Error killing zombies!")
        return simple_dag_runs, len(dagbag.import_errors)