def test_09_join_pipeline_task(self):
    "Test that the dummy join_pipeline_task executes without error"
    task = self.dag.get_task('join_pipeline_task')
    task_instance = models.TaskInstance(task=task,
                                        execution_date=datetime.now())
    task.execute(task_instance.get_template_context())
    # The join task is a dummy; reaching this point without an
    # exception is the success condition.
    assert 1 == 1
def test_clear_api(self):
    task = self.dag_bash.tasks[0]
    task.clear(
        start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
        upstream=True, downstream=True)
    ti = models.TaskInstance(task=task, execution_date=DEFAULT_DATE)
    ti.are_dependents_done()
def process_events(self, executor, dagbag):
    """
    Respond to executor events. Used to identify queued tasks and
    schedule them for further processing.
    """
    for key, executor_state in list(executor.get_event_buffer().items()):
        dag_id, task_id, execution_date = key
        if dag_id not in dagbag.dags:
            self.logger.error(
                'Executor reported a dag_id that was not found in the '
                'DagBag: {}'.format(dag_id))
            continue
        elif not dagbag.dags[dag_id].has_task(task_id):
            self.logger.error(
                'Executor reported a task_id that was not found in the '
                'dag: {} in dag {}'.format(task_id, dag_id))
            continue

        task = dagbag.dags[dag_id].get_task(task_id)
        ti = models.TaskInstance(task, execution_date)
        ti.refresh_from_db()
        if executor_state == State.SUCCESS:
            # collect queued tasks for prioritization
            if ti.state == State.QUEUED:
                self.queued_tis.add(ti)
        else:
            # special instructions for failed executions could go here
            pass
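# A minimal sketch of how process_events might be driven from a
# heartbeat-style loop. `job`, `executor`, `dagbag`, and the interval
# are assumptions for illustration, not names from the code above.
import time

def event_loop(job, executor, dagbag, interval=1.0, max_iterations=None):
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        executor.heartbeat()                  # let the executor report state changes
        job.process_events(executor, dagbag)  # drain the event buffer
        time.sleep(interval)
        iterations += 1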
def setUp(self):
    test_setup(self, dag)
    # Execute the start task, as it is a dummy task and does not need
    # to be tested
    start_task = self.dag.get_task('start_task')
    task_instance = models.TaskInstance(task=start_task,
                                        execution_date=datetime.now())
    start_task.execute(task_instance.get_template_context())
def test_06_weather_ingest_spark_task(self):
    "Test to confirm that the ingest Spark task ingests and persists data"
    local_path = get_local_path(self.weather_ingest_path)
    task = self.dag.get_task('spark_ingest_weatherdata')
    task_instance = models.TaskInstance(task=task,
                                        execution_date=datetime.now())
    task.execute(task_instance.get_template_context())
    assert check_dir_exists(local_path)
    verify_schema(local_path, 17, ['date', 'hum_avg'])
def test_07_weather_transform_cleanup_task(self):
    "Test to confirm that the cleanup task removes the directory if it exists"
    local_path = get_local_path(self.weather_transform_path)
    task = self.dag.get_task('cleanup_transform_weatherdata')
    task_instance = models.TaskInstance(task=task,
                                        execution_date=datetime.now())
    create_tmp_dir(local_path)
    task.execute(task_instance.get_template_context())
    assert not check_dir_exists(local_path)
def test_08_weather_transform_spark_task(self):
    "Test to confirm that the Spark task transforms and persists data"
    local_path = get_local_path(self.weather_transform_path)
    task = self.dag.get_task('spark_transform_weatherdata')
    task_instance = models.TaskInstance(task=task,
                                        execution_date=datetime.now())
    task.execute(task_instance.get_template_context())
    assert check_dir_exists(local_path)
    verify_schema(local_path, 19, ['dayofweek', 'humidity_range'])
def test_02_uber_ingest_spark_task(self):
    "Test to confirm that the ingest Spark task ingests and persists data"
    local_path = get_local_path(self.uber_ingest_path)
    task = self.dag.get_task('spark_ingest_uberdata')
    task_instance = models.TaskInstance(task=task,
                                        execution_date=datetime.now())
    task.execute(task_instance.get_template_context())
    assert check_dir_exists(local_path)
    verify_schema(local_path, 6, ['DATE', 'TIME', 'PICK_UP_ADDRESS'])
def test_10_datamart_cleanup_task(self):
    "Test to confirm that the cleanup task removes the directory if it exists"
    local_path = get_local_path(self.app_path)
    # Task id spelling ('himidity') matches the id defined in the DAG.
    task = self.dag.get_task(
        'cleanup_uber_rides_by_himidity_uberridesbyhumidity')
    task_instance = models.TaskInstance(task=task,
                                        execution_date=datetime.now())
    create_tmp_dir(local_path)
    task.execute(task_instance.get_template_context())
    assert not check_dir_exists(local_path)
def test_scheduler_verify_pool_full(self, mock_pool_full):
    """
    Test task instances not queued when pool is full
    """
    mock_pool_full.return_value = False

    dag = DAG(
        dag_id='test_scheduler_verify_pool_full',
        start_date=DEFAULT_DATE)

    DummyOperator(
        task_id='dummy',
        dag=dag,
        owner='airflow',
        pool='test_scheduler_verify_pool_full')

    session = settings.Session()
    pool = Pool(pool='test_scheduler_verify_pool_full', slots=1)
    session.add(pool)
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    scheduler = SchedulerJob()
    dag.clear()

    # Create 2 dagruns, which will create 2 task instances.
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    self.assertEqual(dr.execution_date, DEFAULT_DATE)
    dr = scheduler.create_dag_run(dag)
    self.assertIsNotNone(dr)
    queue = []
    scheduler._process_task_instances(dag, queue=queue)
    self.assertEqual(len(queue), 2)
    dagbag = SimpleDagBag([dag])

    # Recreated part of the scheduler here, to kick off tasks -> executor
    for ti_key in queue:
        task = dag.get_task(ti_key[1])
        ti = models.TaskInstance(task, ti_key[2])
        # Task starts out in the scheduled state. All tasks in the
        # scheduled state will be sent to the executor
        ti.state = State.SCHEDULED

        # Also save this task instance to the DB.
        session.merge(ti)
        session.commit()

    scheduler._execute_task_instances(dagbag,
                                      (State.SCHEDULED, State.UP_FOR_RETRY))

    self.assertEqual(len(scheduler.executor.queued_tasks), 1)
def run_backfill(self, dag_id, task_id):
    dag = self.dagbag.get_dag(dag_id)
    dag.clear()

    BackfillJob(dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE).run()

    ti = models.TaskInstance(task=dag.get_task(task_id),
                             execution_date=DEFAULT_DATE)
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.SUCCESS)
def test_run_pooling_task(self):
    """
    Test that running a pooled task leaves it in the QUEUED state: it
    waits for a pool slot rather than executing immediately.
    """
    dag = models.DAG(dag_id='test_run_pooling_task')
    task = DummyOperator(task_id='test_run_pooling_task_op', dag=dag,
                         pool='test_run_pooling_task_pool', owner='airflow',
                         start_date=datetime.datetime(2016, 2, 1, 0, 0, 0))
    ti = models.TaskInstance(task=task,
                             execution_date=datetime.datetime.now())
    ti.run()
    assert ti.state == models.State.QUEUED
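# A minimal sketch, assuming the same Pool model used in
# test_scheduler_verify_pool_full above: one plausible reading of the
# QUEUED state is that no 'test_run_pooling_task_pool' slot exists yet,
# so registering the pool with an open slot is what would let the task
# run.
session = settings.Session()
session.add(Pool(pool='test_run_pooling_task_pool', slots=1))
session.commit()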
def test_11_datamart_transform_spark_task(self):
    "Test to confirm that the Spark task joins and persists data"
    local_path = get_local_path(self.app_path)
    # Task id spelling ('himidity') matches the id defined in the DAG.
    task = self.dag.get_task(
        'spark_uber_rides_by_himidity_uberridesbyhumidity')
    task_instance = models.TaskInstance(task=task,
                                        execution_date=datetime.now())
    task.execute(task_instance.get_template_context())
    assert check_dir_exists(local_path)

    files_list = glob.glob(local_path + '/*.csv')
    assert len(files_list) == 1

    with open(files_list[0]) as csv_file:
        schema = csv.DictReader(csv_file).fieldnames
    assert 'humidity_range' in schema
    assert 'count' in schema
def update_last_run(self):
    last_dag_run = self.last_run()
    if last_dag_run:
        dag_task_execution_date = self.previous_schedule(
            last_dag_run.execution_date)
        logging.debug(dag_task_execution_date)
        if (dag_task_execution_date.date() !=
                last_dag_run.execution_date.date()):
            dag_task_execution_date = datetime.combine(
                last_dag_run.execution_date.date(),
                dag_task_execution_date.time())
        logging.debug(dag_task_execution_date)
        logging.debug(last_dag_run.execution_date)
        if dag_task_execution_date != last_dag_run.execution_date:
            session = settings.Session()
            dag_re_schedule_run = models.DagRun(
                dag_id=self.dag_id,
                run_id='scheduled__' + dag_task_execution_date.isoformat(),
                execution_date=dag_task_execution_date,
                start_date=datetime.now(),
                end_date=datetime.now(),
                state=State.SUCCESS,
                external_trigger=False)
            session.add(dag_re_schedule_run)
            session.commit()
            for dag_task_id in self.task_ids:
                task_instance = models.TaskInstance(
                    self.get_task(dag_task_id),
                    execution_date=dag_task_execution_date,
                    state=State.SUCCESS)
                task_instance.start_date = datetime.now()
                task_instance.end_date = datetime.now()
                session.add(task_instance)
            session.commit()
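# A minimal usage sketch, assuming update_last_run is a method added to
# DAG as above; 'my_dag_id' is a placeholder. The method records a
# synthetic successful DagRun (and matching task instances) at the
# recomputed schedule date, so the scheduler treats that slot as done.
from airflow.models import DagBag

dagbag = DagBag()
dag = dagbag.get_dag('my_dag_id')  # hypothetical dag_id
if dag is not None:
    dag.update_last_run()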
def get_rendered_template(task):
    """
    Renders the templated fields of a BigQueryOperator in place and
    returns the names of those fields.

    :param task: BigQueryOperator task that needs to be rendered
    :type task: BigQueryOperator
    :return: names of the templated fields of the operator
    :rtype: tuple(str)
    """
    # Prepend 'dags' to the bql/sql script path to create the correct path
    if hasattr(task, 'sql'):
        task.sql = '/dags/' + task.sql
    if hasattr(task, 'bql'):
        task.bql = '/dags/' + task.bql

    dttm = datetime.datetime(2018, 10, 21, 0, 0, 0)
    ti = af_models.TaskInstance(task=task, execution_date=dttm)
    try:
        ti.render_templates()
    except Exception as e:
        raise Exception("Error rendering template: " + str(e))
    return task.__class__.template_fields
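# A minimal usage sketch. The DAG, task_id, and SQL path below are
# assumptions made up for illustration; only get_rendered_template
# comes from the code above.
import datetime
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

example_dag = DAG('example_bq_dag',
                  start_date=datetime.datetime(2018, 10, 1))
bq_task = BigQueryOperator(
    task_id='example_bq_task',
    bql='sql/example_query.sql',  # hypothetical templated script path
    dag=example_dag)

for field in get_rendered_template(bq_task):
    # After rendering, each templated attribute holds its final value.
    print(field, getattr(bq_task, field, None))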
def _execute(self):
    """
    Runs a dag for a specified date range.
    """
    start_date = self.bf_start_date
    end_date = self.bf_end_date

    session = settings.Session()
    pickle = models.DagPickle(self.dag, self)
    executor = self.executor
    executor.start()
    session.add(pickle)
    session.commit()
    pickle_id = pickle.id

    # Build a list of all instances to run
    tasks_to_run = {}
    failed = []
    succeeded = []
    started = []
    wont_run = []
    for task in self.dag.tasks:
        start_date = start_date or task.start_date
        end_date = end_date or task.end_date or datetime.now()
        for dttm in utils.date_range(start_date, end_date,
                                     task.dag.schedule_interval):
            ti = models.TaskInstance(task, dttm)
            tasks_to_run[ti.key] = ti

    # Triggering what is ready to get triggered
    while tasks_to_run:
        msg = ("Yet to run: {0} | "
               "Succeeded: {1} | "
               "Started: {2} | "
               "Failed: {3} | "
               "Won't run: {4} ").format(
            len(tasks_to_run), len(succeeded), len(started),
            len(failed), len(wont_run))
        logging.info(msg)
        for key, ti in list(tasks_to_run.items()):
            ti.refresh_from_db()
            if ti.state == State.SUCCESS and key in tasks_to_run:
                succeeded.append(key)
                del tasks_to_run[key]
            elif ti.is_runnable():
                executor.queue_command(
                    key=ti.key,
                    command=ti.command(
                        mark_success=self.mark_success,
                        pickle_id=pickle_id))
                ti.state = State.RUNNING
                if key not in started:
                    started.append(key)
        self.heartbeat()
        executor.heartbeat()

        # Reacting to events
        for key, state in list(executor.get_event_buffer().items()):
            dag_id, task_id, execution_date = key
            if key not in tasks_to_run:
                continue
            ti = tasks_to_run[key]
            ti.refresh_from_db()
            if ti.state == State.FAILED:
                failed.append(key)
                logging.error("Task instance " + str(key) + " failed")
                del tasks_to_run[key]
                # Removing downstream tasks from the one that has failed
                for t in self.dag.get_task(task_id).get_flat_relatives(
                        upstream=False):
                    key = (ti.dag_id, t.task_id, execution_date)
                    if key in tasks_to_run:
                        wont_run.append(key)
                        del tasks_to_run[key]
            elif ti.state == State.SUCCESS:
                succeeded.append(key)
                del tasks_to_run[key]

    executor.end()
    logging.info("Run summary:")
    session.close()
def _execute(self):
    """
    Runs a dag for a specified date range.
    """
    session = settings.Session()

    start_date = self.bf_start_date
    end_date = self.bf_end_date

    # picklin'
    pickle_id = None
    if not self.donot_pickle and self.executor.__class__ not in (
            executors.LocalExecutor, executors.SequentialExecutor):
        pickle = models.DagPickle(self.dag)
        session.add(pickle)
        session.commit()
        pickle_id = pickle.id

    executor = self.executor
    executor.start()
    executor_fails = Counter()

    # Build a list of all instances to run
    tasks_to_run = {}
    failed = set()
    succeeded = set()
    started = set()
    skipped = set()
    not_ready = set()
    deadlocked = set()

    for task in self.dag.tasks:
        if (not self.include_adhoc) and task.adhoc:
            continue

        start_date = start_date or task.start_date
        end_date = end_date or task.end_date or datetime.now()
        for dttm in self.dag.date_range(start_date, end_date=end_date):
            ti = models.TaskInstance(task, dttm)
            tasks_to_run[ti.key] = ti
            session.merge(ti)
    session.commit()

    # Triggering what is ready to get triggered
    while tasks_to_run and not deadlocked:

        not_ready.clear()

        for key, ti in list(tasks_to_run.items()):

            ti.refresh_from_db()
            ignore_depends_on_past = (
                self.ignore_first_depends_on_past and
                ti.execution_date == (start_date or ti.start_date))

            # The task was already marked successful or skipped by a
            # different Job. Don't rerun it.
            if key not in started:
                if ti.state == State.SUCCESS:
                    succeeded.add(key)
                    tasks_to_run.pop(key)
                    continue
                elif ti.state == State.SKIPPED:
                    skipped.add(key)
                    tasks_to_run.pop(key)
                    continue

            # Is the task runnable? -- then run it
            if ti.is_queueable(
                    include_queued=True,
                    ignore_depends_on_past=ignore_depends_on_past,
                    flag_upstream_failed=True):
                self.logger.debug('Sending {} to executor'.format(ti))
                executor.queue_task_instance(
                    ti,
                    mark_success=self.mark_success,
                    pickle_id=pickle_id,
                    ignore_dependencies=self.ignore_dependencies,
                    ignore_depends_on_past=ignore_depends_on_past,
                    pool=self.pool)
                started.add(key)

            # Mark the task as not ready to run
            elif ti.state in (State.NONE, State.UPSTREAM_FAILED):
                not_ready.add(key)

        self.heartbeat()
        executor.heartbeat()

        # If the set of tasks that aren't ready ever equals the set of
        # tasks to run, then the backfill is deadlocked
        if not_ready and not_ready == set(tasks_to_run):
            deadlocked.update(tasks_to_run.values())
            tasks_to_run.clear()

        # Reacting to events
        for key, state in list(executor.get_event_buffer().items()):
            dag_id, task_id, execution_date = key
            if key not in tasks_to_run:
                continue
            ti = tasks_to_run[key]
            ti.refresh_from_db()

            # executor reports failure
            if state == State.FAILED:

                # task reports running
                if ti.state == State.RUNNING:
                    msg = (
                        'Executor reports that task instance {} failed '
                        'although the task says it is running.'.format(key))
                    self.logger.error(msg)
                    ti.handle_failure(msg)
                    tasks_to_run.pop(key)

                # task reports skipped
                elif ti.state == State.SKIPPED:
                    self.logger.error("Skipping {} ".format(key))
                    skipped.add(key)
                    tasks_to_run.pop(key)

                # anything else is a failure
                else:
                    self.logger.error("Task instance {} failed".format(key))
                    failed.add(key)
                    tasks_to_run.pop(key)

            # executor reports success
            elif state == State.SUCCESS:

                # task reports success
                if ti.state == State.SUCCESS:
                    self.logger.info(
                        'Task instance {} succeeded'.format(key))
                    succeeded.add(key)
                    tasks_to_run.pop(key)

                # task reports failure
                elif ti.state == State.FAILED:
                    self.logger.error("Task instance {} failed".format(key))
                    failed.add(key)
                    tasks_to_run.pop(key)

                # task reports skipped
                elif ti.state == State.SKIPPED:
                    self.logger.info("Task instance {} skipped".format(key))
                    skipped.add(key)
                    tasks_to_run.pop(key)

                # this probably won't ever be triggered
                elif ti in not_ready:
                    self.logger.info(
                        "{} wasn't expected to run, but it did".format(ti))

                # executor reports success but task does not - this is weird
                elif ti.state not in (State.SUCCESS, State.QUEUED,
                                      State.UP_FOR_RETRY):
                    self.logger.error(
                        "The airflow run command failed "
                        "at reporting an error. This should not occur "
                        "in normal circumstances. Task state is '{}',"
                        "reported state is '{}'. TI is {}"
                        "".format(ti.state, state, ti))

                    # if the executor fails 3 or more times, stop trying to
                    # run the task
                    executor_fails[key] += 1
                    if executor_fails[key] >= 3:
                        msg = (
                            'The airflow run command failed to report an '
                            'error for task {} three or more times. The '
                            'task is being marked as failed. This is very '
                            'unusual and probably means that an error is '
                            'taking place before the task even '
                            'starts.'.format(key))
                        self.logger.error(msg)
                        ti.handle_failure(msg)
                        tasks_to_run.pop(key)

        msg = ' | '.join([
            "[backfill progress]",
            "waiting: {0}",
            "succeeded: {1}",
            "kicked_off: {2}",
            "failed: {3}",
            "skipped: {4}",
            "deadlocked: {5}"
        ]).format(
            len(tasks_to_run),
            len(succeeded),
            len(started),
            len(failed),
            len(skipped),
            len(deadlocked))
        self.logger.info(msg)

    executor.end()
    session.close()

    err = ''
    if failed:
        err += (
            "---------------------------------------------------\n"
            "Some task instances failed:\n{}\n".format(failed))
    if deadlocked:
        err += (
            '---------------------------------------------------\n'
            'BackfillJob is deadlocked.')
        deadlocked_depends_on_past = any(
            t.are_dependencies_met() != t.are_dependencies_met(
                ignore_depends_on_past=True)
            for t in deadlocked)
        if deadlocked_depends_on_past:
            err += (
                'Some of the deadlocked tasks were unable to run because '
                'of "depends_on_past" relationships. Try running the '
                'backfill with the option '
                '"ignore_first_depends_on_past=True" or passing "-I" at '
                'the command line.')
        err += ' These tasks were unable to run:\n{}\n'.format(deadlocked)
    if err:
        raise AirflowException(err)

    self.logger.info("Backfill done. Exiting.")
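# A toy illustration of the deadlock check above, with plain tuples
# standing in for task-instance keys: once every remaining task is
# "not ready", no pass over tasks_to_run can make progress, and the
# backfill declares a deadlock.
tasks_to_run = {('dag', 'a', 1): 'ti_a', ('dag', 'b', 1): 'ti_b'}
not_ready = {('dag', 'a', 1), ('dag', 'b', 1)}
assert not_ready == set(tasks_to_run)  # deadlocked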
def _execute(self):
    """
    Runs a dag for a specified date range.
    """
    session = settings.Session()

    start_date = self.bf_start_date
    end_date = self.bf_end_date

    # picklin'
    pickle_id = None
    if not self.donot_pickle and self.executor.__class__ not in (
            executors.LocalExecutor, executors.SequentialExecutor):
        pickle = models.DagPickle(self.dag)
        session.add(pickle)
        session.commit()
        pickle_id = pickle.id

    executor = self.executor
    executor.start()

    # Build a list of all instances to run
    tasks_to_run = {}
    failed = []
    succeeded = []
    started = []
    wont_run = []
    for task in self.dag.tasks:
        if (not self.include_adhoc) and task.adhoc:
            continue
        start_date = start_date or task.start_date
        end_date = end_date or task.end_date or datetime.now()
        for dttm in utils.date_range(
                start_date, end_date, task.dag.schedule_interval):
            ti = models.TaskInstance(task, dttm)
            tasks_to_run[ti.key] = ti

    # Triggering what is ready to get triggered
    while tasks_to_run:
        for key, ti in list(tasks_to_run.items()):
            ti.refresh_from_db()
            if ti.state == State.SUCCESS and key in tasks_to_run:
                succeeded.append(key)
                del tasks_to_run[key]
            elif ti.is_runnable():
                executor.queue_task_instance(
                    ti,
                    mark_success=self.mark_success,
                    task_start_date=self.bf_start_date,
                    pickle_id=pickle_id,
                    ignore_dependencies=self.ignore_dependencies)
                ti.state = State.RUNNING
                if key not in started:
                    started.append(key)
        self.heartbeat()
        executor.heartbeat()

        # Reacting to events
        for key, state in list(executor.get_event_buffer().items()):
            dag_id, task_id, execution_date = key
            if key not in tasks_to_run:
                continue
            ti = tasks_to_run[key]
            ti.refresh_from_db()
            if ti.state == State.FAILED:
                failed.append(key)
                logging.error("Task instance " + str(key) + " failed")
                del tasks_to_run[key]
                # Removing downstream tasks from the one that has failed
                for t in self.dag.get_task(task_id).get_flat_relatives(
                        upstream=False):
                    key = (ti.dag_id, t.task_id, execution_date)
                    if key in tasks_to_run:
                        wont_run.append(key)
                        del tasks_to_run[key]
            elif ti.state == State.SUCCESS:
                succeeded.append(key)
                del tasks_to_run[key]

        msg = (
            "[backfill progress] "
            "waiting: {0} | "
            "succeeded: {1} | "
            "kicked_off: {2} | "
            "failed: {3} | "
            "skipped: {4} ").format(
            len(tasks_to_run), len(succeeded), len(started),
            len(failed), len(wont_run))
        logging.info(msg)

    executor.end()
    session.close()
    if failed:
        raise AirflowException(
            "Some task instances failed, here's the list:\n" + str(failed))
    logging.info("All done. Exiting.")
def _execute(self):
    """
    Runs a dag for a specified date range.
    """
    session = settings.Session()

    start_date = self.bf_start_date
    end_date = self.bf_end_date

    # picklin'
    pickle_id = None
    if not self.donot_pickle and self.executor.__class__ not in (
            executors.LocalExecutor, executors.SequentialExecutor):
        pickle = models.DagPickle(self.dag)
        session.add(pickle)
        session.commit()
        pickle_id = pickle.id

    executor = self.executor
    executor.start()

    # Build a list of all instances to run
    tasks_to_run = {}
    failed = []
    succeeded = []
    started = []
    wont_run = []
    for task in self.dag.tasks:
        if (not self.include_adhoc) and task.adhoc:
            continue
        start_date = start_date or task.start_date
        end_date = end_date or task.end_date or datetime.now()
        for dttm in self.dag.date_range(start_date, end_date=end_date):
            ti = models.TaskInstance(task, dttm)
            tasks_to_run[ti.key] = ti

    # Triggering what is ready to get triggered
    while tasks_to_run:
        for key, ti in list(tasks_to_run.items()):
            ti.refresh_from_db()
            if (ti.state in (State.SUCCESS, State.SKIPPED) and
                    key in tasks_to_run):
                succeeded.append(key)
                tasks_to_run.pop(key)
            elif ti.state in (State.RUNNING, State.QUEUED):
                continue
            elif ti.is_runnable(flag_upstream_failed=True):
                executor.queue_task_instance(
                    ti,
                    mark_success=self.mark_success,
                    task_start_date=self.bf_start_date,
                    pickle_id=pickle_id,
                    ignore_dependencies=self.ignore_dependencies,
                    pool=self.pool)
                ti.state = State.RUNNING
                if key not in started:
                    started.append(key)
        self.heartbeat()
        executor.heartbeat()

        # Reacting to events
        for key, state in list(executor.get_event_buffer().items()):
            dag_id, task_id, execution_date = key
            if key not in tasks_to_run:
                continue
            ti = tasks_to_run[key]
            ti.refresh_from_db()
            if (ti.state in (State.FAILED, State.SKIPPED) or
                    state == State.FAILED):
                if ti.state == State.FAILED or state == State.FAILED:
                    failed.append(key)
                    self.logger.error(
                        "Task instance " + str(key) + " failed")
                elif ti.state == State.SKIPPED:
                    wont_run.append(key)
                    self.logger.error("Skipping " + str(key))
                tasks_to_run.pop(key)

                # Removing downstream tasks that also shouldn't run
                for t in self.dag.get_task(task_id).get_flat_relatives(
                        upstream=False):
                    key = (ti.dag_id, t.task_id, execution_date)
                    if key in tasks_to_run:
                        wont_run.append(key)
                        tasks_to_run.pop(key)
            elif ti.state == State.SUCCESS and state == State.SUCCESS:
                succeeded.append(key)
                tasks_to_run.pop(key)
            elif (ti.state not in (State.SUCCESS, State.QUEUED) and
                  state == State.SUCCESS):
                self.logger.error(
                    "The airflow run command failed "
                    "at reporting an error. This should not occur "
                    "in normal circumstances. Task state is '{}',"
                    "reported state is '{}'. TI is {}"
                    "".format(ti.state, state, ti))

        msg = (
            "[backfill progress] "
            "waiting: {0} | "
            "succeeded: {1} | "
            "kicked_off: {2} | "
            "failed: {3} | "
            "wont_run: {4} ").format(
            len(tasks_to_run), len(succeeded), len(started),
            len(failed), len(wont_run))
        self.logger.info(msg)

    executor.end()
    session.close()
    if failed:
        msg = (
            "------------------------------------------\n"
            "Some task instances failed, "
            "here's the list:\n{}".format(failed))
        raise AirflowException(msg)
    self.logger.info("All done. Exiting.")
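# A minimal usage sketch for the backfill job above, mirroring
# run_backfill earlier in this file; the DAG id and date range are
# placeholders.
from datetime import datetime
from airflow.jobs import BackfillJob
from airflow.models import DagBag

dag = DagBag().get_dag('example_dag_id')  # hypothetical dag_id
dag.clear()
BackfillJob(
    dag=dag,
    start_date=datetime(2016, 1, 1),
    end_date=datetime(2016, 1, 7)).run()  # raises AirflowException on failure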