def run(args):
    utils.pessimistic_connection_handling()

    # Setting up logging
    log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    # store old log (to help with S3 appends)
    if os.path.exists(filename):
        with open(filename, 'r') as logfile:
            old_log = logfile.read()
    else:
        old_log = None

    subdir = process_subdir(args.subdir)
    logging.root.handlers = []
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)

    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found in {1}'.format(
                args.dag_id, subdir)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(DagPickle).filter(
            DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print(('Pickled dag {dag} '
                       'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()

    if configuration.get('core', 'S3_LOG_FOLDER').startswith('s3:'):
        import boto
        s3_log = filename.replace(
            log, configuration.get('core', 'S3_LOG_FOLDER'))
        bucket, key = s3_log.lstrip('s3:/').split('/', 1)
        if os.path.exists(filename):

            # get logs
            with open(filename, 'r') as logfile:
                new_log = logfile.read()

            # remove old logs (since they are already in S3)
            # str.replace returns a new string, so the result must be kept
            if old_log:
                new_log = new_log.replace(old_log, '')

            try:
                s3 = boto.connect_s3()
                s3_key = boto.s3.key.Key(s3.get_bucket(bucket), key)

                # append new logs to old S3 logs, if available
                if s3_key.exists():
                    old_s3_log = s3_key.get_contents_as_string().decode()
                    new_log = old_s3_log + '\n' + new_log

                # send log to S3
                encrypt = configuration.get('core', 'ENCRYPT_S3_LOGS')
                s3_key.set_contents_from_string(new_log, encrypt_key=encrypt)
            except:
                print('Could not send logs to S3.')
def process_dag(self, dag, executor):
    """
    This method schedules a single DAG by looking at the latest
    run for each task and attempting to schedule the following run.

    As multiple schedulers may be running for redundancy, this
    function takes a lock on the DAG and timestamps the last run
    in ``last_scheduler_run``.
    """
    TI = models.TaskInstance
    DagModel = models.DagModel
    session = settings.Session()

    # picklin'
    pickle_id = None
    if self.do_pickle and self.executor.__class__ not in (
            executors.LocalExecutor, executors.SequentialExecutor):
        pickle_id = dag.pickle(session).id

    db_dag = session.query(DagModel).filter_by(dag_id=dag.dag_id).first()
    last_scheduler_run = db_dag.last_scheduler_run or datetime(2000, 1, 1)
    secs_since_last = (
        datetime.now() - last_scheduler_run).total_seconds()
    # if db_dag.scheduler_lock or
    if secs_since_last < self.heartrate:
        session.commit()
        session.close()
        return None
    else:
        # Taking a lock
        db_dag.scheduler_lock = True
        db_dag.last_scheduler_run = datetime.now()
        session.commit()

    active_runs = dag.get_active_runs()

    self.logger.info('Getting list of tasks to skip for active runs.')
    skip_tis = set()
    if active_runs:
        qry = (
            session.query(TI.task_id, TI.execution_date)
            .filter(
                TI.dag_id == dag.dag_id,
                TI.execution_date.in_(active_runs),
                TI.state.in_((State.RUNNING, State.SUCCESS, State.FAILED)),
            )
        )
        skip_tis = {(ti[0], ti[1]) for ti in qry.all()}

    descartes = [obj for obj in product(dag.tasks, active_runs)]
    self.logger.info('Checking dependencies on {} task instances, minus {} '
                     'skippable ones'.format(len(descartes), len(skip_tis)))
    for task, dttm in descartes:
        if task.adhoc or (task.task_id, dttm) in skip_tis:
            continue
        ti = TI(task, dttm)
        ti.refresh_from_db()
        if ti.state in (
                State.RUNNING, State.QUEUED, State.SUCCESS, State.FAILED):
            continue
        elif ti.is_runnable(flag_upstream_failed=True):
            self.logger.debug('Firing task: {}'.format(ti))
            executor.queue_task_instance(ti, pickle_id=pickle_id)

    # Releasing the lock
    self.logger.debug("Unlocking DAG (scheduler_lock)")
    db_dag = (
        session.query(DagModel)
        .filter(DagModel.dag_id == dag.dag_id)
        .first()
    )
    db_dag.scheduler_lock = False
    session.merge(db_dag)
    session.commit()
    session.close()
"age_check_column": Task.date_done, "keep_last": False, "keep_last_filters": None, "keep_last_group_by": None }, { "airflow_db_model": TaskSet, "age_check_column": TaskSet.date_done, "keep_last": False, "keep_last_filters": None, "keep_last_group_by": None })) except Exception as e: logging.error(e) session = settings.Session() default_args = { "owner": DAG_OWNER_NAME, "depends_on_past": False, "email": ALERT_EMAIL_ADDRESSES, "email_on_failure": True, "email_on_retry": False, "start_date": START_DATE, "retries": 1, "retry_delay": timedelta(minutes=1) } dag = DAG(DAG_ID, default_args=default_args, schedule_interval=SCHEDULE_INTERVAL,
def run(args, dag=None):
    db_utils.pessimistic_connection_handling()
    if dag:
        args.dag_id = dag.dag_id

    # Setting up logging
    log_base = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    logging.root.handlers = []
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)

    if not args.pickle and not dag:
        dag = get_dag(args)
    elif not dag:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(DagPickle).filter(
            DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
    task = dag.get_task(task_id=args.task_id)

    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print(('Pickled dag {dag} '
                       'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # store logs remotely
    remote_base = conf.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and conf.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER conf key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your conf still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = conf.get('core', 'S3_LOG_FOLDER')

    if os.path.exists(filename):
        # read log and remove old logs to get just the latest additions
        with open(filename, 'r') as logfile:
            log = logfile.read()

        remote_log_location = filename.replace(log_base, remote_base)
        # S3
        if remote_base.startswith('s3:/'):
            logging_utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            logging_utils.GCSLog().write(log, remote_log_location, append=True)
        # Other
        elif remote_base and remote_base != 'None':
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))
def setUp(self):
    self.session = settings.Session()
    self.dagbag = models.DagBag(include_examples=True)
    self.dag_id = 'example_bash_operator'
    self.dag = self.dagbag.dags[self.dag_id]
def test_dags_clear(self):
    # setup
    session = settings.Session()
    dags, tis = [], []
    num_of_dags = 5
    for i in range(num_of_dags):
        dag = DAG('test_dag_clear_' + str(i), start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        ti = TI(task=DummyOperator(task_id='test_task_clear_' + str(i),
                                   owner='test', dag=dag),
                execution_date=DEFAULT_DATE)
        dags.append(dag)
        tis.append(ti)

    # test clear all dags
    for i in range(num_of_dags):
        tis[i].run()
        self.assertEqual(tis[i].state, State.SUCCESS)
        self.assertEqual(tis[i].try_number, 2)
        self.assertEqual(tis[i].max_tries, 0)

    DAG.clear_dags(dags)

    for i in range(num_of_dags):
        tis[i].refresh_from_db()
        self.assertEqual(tis[i].state, State.NONE)
        self.assertEqual(tis[i].try_number, 2)
        self.assertEqual(tis[i].max_tries, 1)

    # test dry_run
    for i in range(num_of_dags):
        tis[i].run()
        self.assertEqual(tis[i].state, State.SUCCESS)
        self.assertEqual(tis[i].try_number, 3)
        self.assertEqual(tis[i].max_tries, 1)

    DAG.clear_dags(dags, dry_run=True)

    for i in range(num_of_dags):
        tis[i].refresh_from_db()
        self.assertEqual(tis[i].state, State.SUCCESS)
        self.assertEqual(tis[i].try_number, 3)
        self.assertEqual(tis[i].max_tries, 1)

    # test only_failed
    from random import randint
    failed_dag_idx = randint(0, len(tis) - 1)
    tis[failed_dag_idx].state = State.FAILED
    session.merge(tis[failed_dag_idx])
    session.commit()

    DAG.clear_dags(dags, only_failed=True)

    for i in range(num_of_dags):
        tis[i].refresh_from_db()
        if i != failed_dag_idx:
            self.assertEqual(tis[i].state, State.SUCCESS)
            self.assertEqual(tis[i].try_number, 3)
            self.assertEqual(tis[i].max_tries, 1)
        else:
            self.assertEqual(tis[i].state, State.NONE)
            self.assertEqual(tis[i].try_number, 3)
            self.assertEqual(tis[i].max_tries, 2)
def clear_missing_dags_fn(**context):
    logging.info("Starting to run Clear Process")
    try:
        host_name = socket.gethostname()
        host_ip = socket.gethostbyname(host_name)
        logging.info("Running on Machine with Host Name: " + host_name)
        logging.info("Running on Machine with IP: " + host_ip)
    except Exception as e:
        print("Unable to get Host Name and IP: " + str(e))

    session = settings.Session()

    logging.info("Configurations:")
    logging.info("enable_delete: " + str(ENABLE_DELETE))
    logging.info("session: " + str(session))
    logging.info("")

    dags = session.query(DagModel).all()
    entries_to_delete = []
    for dag in dags:
        # Check if it is a zip-file
        if dag.fileloc is not None and '.zip/' in dag.fileloc:
            index = dag.fileloc.rfind('.zip/') + len('.zip')
            fileloc = dag.fileloc[0:index]
        else:
            fileloc = dag.fileloc

        if fileloc is None:
            logging.info(
                "After checking DAG '" + str(dag) +
                "', the fileloc was set to None so assuming the Python " +
                "definition file DOES NOT exist"
            )
            entries_to_delete.append(dag)
        elif not os.path.exists(fileloc):
            logging.info(
                "After checking DAG '" + str(dag) +
                "', the Python definition file DOES NOT exist: " + fileloc
            )
            entries_to_delete.append(dag)
        else:
            logging.info(
                "After checking DAG '" + str(dag) +
                "', the Python definition file does exist: " + fileloc
            )

    logging.info("Process will be Deleting the DAG(s) from the DB:")
    for entry in entries_to_delete:
        logging.info("\tEntry: " + str(entry))
    logging.info(
        "Process will be Deleting " + str(len(entries_to_delete)) + " DAG(s)"
    )

    if ENABLE_DELETE:
        logging.info("Performing Delete...")
        for entry in entries_to_delete:
            session.delete(entry)
        logging.info("Finished Performing Delete")
    else:
        logging.warning("You've opted to skip deleting the DAG entries!!!")

    logging.info("Finished Running Clear Process")
def schedule_dag(self, dag):
    """
    This method checks whether a new DagRun needs to be created
    for a DAG based on scheduling interval
    Returns DagRun if one is scheduled. Otherwise returns None.
    """
    if dag.schedule_interval:
        DagRun = models.DagRun
        session = settings.Session()
        active_runs = DagRun.find(
            dag_id=dag.dag_id,
            state=State.RUNNING,
            external_trigger=False,
            session=session
        )
        if len(active_runs) >= dag.max_active_runs:
            return
        for dr in active_runs:
            if (
                    dr.start_date and dag.dagrun_timeout and
                    dr.start_date < datetime.now() - dag.dagrun_timeout):
                dr.state = State.FAILED
                dr.end_date = datetime.now()
        session.commit()

        # this query should be replaced by find dagrun
        qry = (
            session.query(func.max(DagRun.execution_date))
            .filter_by(dag_id=dag.dag_id)
            .filter(or_(
                DagRun.external_trigger == False,
                # add % as a wildcard for the like query
                DagRun.run_id.like(DagRun.ID_PREFIX + '%')
            ))
        )
        last_scheduled_run = qry.scalar()

        # don't schedule @once again
        if dag.schedule_interval == '@once' and last_scheduled_run:
            return None

        next_run_date = None
        if not last_scheduled_run:
            # First run
            task_start_dates = [t.start_date for t in dag.tasks]
            if task_start_dates:
                next_run_date = min(task_start_dates)
        else:
            next_run_date = dag.following_schedule(last_scheduled_run)

        # don't ever schedule prior to the dag's start_date
        if dag.start_date:
            next_run_date = (dag.start_date if not next_run_date
                             else max(next_run_date, dag.start_date))

        # this structure is necessary to avoid a TypeError from concatenating
        # NoneType
        if dag.schedule_interval == '@once':
            period_end = next_run_date
        elif next_run_date:
            period_end = dag.following_schedule(next_run_date)

        # Don't schedule a dag beyond its end_date (as specified by the dag param)
        if next_run_date and dag.end_date and next_run_date > dag.end_date:
            return

        # Don't schedule a dag beyond its end_date (as specified by the task params)
        # Get the min task end date, which may come from the dag.default_args
        min_task_end_date = []
        task_end_dates = [t.end_date for t in dag.tasks if t.end_date]
        if task_end_dates:
            min_task_end_date = min(task_end_dates)
        if next_run_date and min_task_end_date and next_run_date > min_task_end_date:
            return

        if next_run_date and period_end and period_end <= datetime.now():
            next_run = dag.create_dagrun(
                run_id='scheduled__' + next_run_date.isoformat(),
                execution_date=next_run_date,
                start_date=datetime.now(),
                state=State.RUNNING,
                external_trigger=False
            )
            return next_run
def process_dag(self, dag, queue):
    """
    This method schedules a single DAG by looking at the latest
    run for each task and attempting to schedule the following run.

    As multiple schedulers may be running for redundancy, this
    function takes a lock on the DAG and timestamps the last run
    in ``last_scheduler_run``.
    """
    DagModel = models.DagModel
    session = settings.Session()

    # picklin'
    pickle_id = None
    if self.do_pickle and self.executor.__class__ not in (
            executors.LocalExecutor, executors.SequentialExecutor):
        pickle_id = dag.pickle(session).id

    # obtain db lock
    db_dag = session.query(DagModel).filter_by(
        dag_id=dag.dag_id
    ).with_for_update().one()

    last_scheduler_run = db_dag.last_scheduler_run or datetime(2000, 1, 1)
    secs_since_last = (datetime.now() - last_scheduler_run).total_seconds()

    if secs_since_last < self.heartrate:
        # release db lock
        session.commit()
        session.close()
        return None

    # Release the db lock
    # the assumption here is that process_dag will take less
    # time than self.heartrate, otherwise we might unlock too
    # quickly and this should be moved below; but that would increase
    # the time the record is locked and is blocking for other calls.
    db_dag.last_scheduler_run = datetime.now()
    session.commit()

    # update the state of the previously active dag runs
    dag_runs = DagRun.find(dag_id=dag.dag_id, state=State.RUNNING,
                           session=session)
    active_dag_runs = []
    for run in dag_runs:
        # do not consider runs that are executed in the future
        if run.execution_date > datetime.now():
            continue

        # todo: run.task is transient but needs to be set
        run.dag = dag

        # todo: preferably the integrity check happens at dag collection time
        run.verify_integrity(session=session)
        run.update_state(session=session)
        if run.state == State.RUNNING:
            active_dag_runs.append(run)

    for run in active_dag_runs:
        # this needs a fresh session sometimes tis get detached
        tis = run.get_task_instances(state=(State.NONE, State.UP_FOR_RETRY))

        # this loop is quite slow as it uses are_dependencies_met for
        # every task (in ti.is_runnable). This is also called in
        # update_state above which has already checked these tasks
        for ti in tis:
            task = dag.get_task(ti.task_id)

            # fixme: ti.task is transient but needs to be set
            ti.task = task

            # future: remove adhoc
            if task.adhoc:
                continue

            if ti.is_runnable(flag_upstream_failed=True):
                self.logger.debug('Queuing task: {}'.format(ti))

                ti.refresh_from_db(session=session, lock_for_update=True)
                # another scheduler could have picked this task
                # todo: UP_FOR_RETRY still could create a race condition
                # (compare states with ==, not identity, since State
                # constants are plain strings)
                if ti.state == State.SCHEDULED:
                    session.commit()
                    self.logger.debug("Task {} was picked up by another "
                                      "scheduler".format(ti))
                    continue
                elif ti.state == State.NONE:
                    ti.state = State.SCHEDULED

                session.merge(ti)
                session.commit()

                queue.put((ti.key, pickle_id))

    session.close()
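The `process_dag` variants in this listing coordinate redundant schedulers in two ways: the older ones flip a `scheduler_lock` flag, while the one above holds a `SELECT ... FOR UPDATE` row lock just long enough to fence on `last_scheduler_run`. Below is a minimal standalone sketch of that row-lock heartbeat pattern; the model and helper names (`HeartbeatRow`, `try_claim`, `make_session`) are illustrative, not Airflow APIs.

from datetime import datetime

from sqlalchemy import Column, DateTime, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class HeartbeatRow(Base):
    __tablename__ = 'heartbeat_row'
    dag_id = Column(String(250), primary_key=True)
    last_scheduler_run = Column(DateTime)


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
make_session = sessionmaker(bind=engine)


def try_claim(dag_id, heartrate=5.0):
    """Return True if this process claimed the DAG for a scheduling pass."""
    session = make_session()
    try:
        # SELECT ... FOR UPDATE blocks competing schedulers on this row
        # until commit (SQLite accepts but ignores the FOR UPDATE clause).
        row = (session.query(HeartbeatRow)
               .filter_by(dag_id=dag_id)
               .with_for_update()
               .one())
        last = row.last_scheduler_run or datetime(2000, 1, 1)
        if (datetime.now() - last).total_seconds() < heartrate:
            session.commit()  # release the lock without claiming
            return False
        row.last_scheduler_run = datetime.now()
        session.commit()  # releases the lock; the timestamp fences others
        return True
    finally:
        session.close()


seed = make_session()
seed.add(HeartbeatRow(dag_id='demo'))
seed.commit()
seed.close()
print(try_claim('demo'))  # True: first claim succeeds
print(try_claim('demo'))  # False: still within the heartrate window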
def setUpClass(cls):
    super(TestLocalClient, cls).setUpClass()
    session = settings.Session()
    session.query(models.Pool).delete()
    session.commit()
    session.close()
def setUp(self):
    super(TestLocalClient, self).setUp()
    self.client = Client(api_base_url=None, auth=None)
    self.session = settings.Session()
def initdb():
    session = settings.Session()

    from airflow import models
    upgradedb()

    merge_conn(
        models.Connection(
            conn_id='airflow_db', conn_type='mysql',
            host='localhost', login='******', password='',
            schema='airflow'))
    merge_conn(
        models.Connection(
            conn_id='airflow_ci', conn_type='mysql',
            host='localhost', login='******',
            schema='airflow_ci'))
    merge_conn(
        models.Connection(
            conn_id='beeline_default', conn_type='beeline', port="10000",
            host='localhost', extra="{\"use_beeline\": true, \"auth\": \"\"}",
            schema='default'))
    merge_conn(
        models.Connection(
            conn_id='bigquery_default', conn_type='bigquery'))
    merge_conn(
        models.Connection(
            conn_id='local_mysql', conn_type='mysql',
            host='localhost', login='******', password='******',
            schema='airflow'))
    merge_conn(
        models.Connection(
            conn_id='presto_default', conn_type='presto',
            host='localhost',
            schema='hive', port=3400))
    merge_conn(
        models.Connection(
            conn_id='hive_cli_default', conn_type='hive_cli',
            schema='default',))
    merge_conn(
        models.Connection(
            conn_id='hiveserver2_default', conn_type='hiveserver2',
            host='localhost',
            schema='default', port=10000))
    merge_conn(
        models.Connection(
            conn_id='metastore_default', conn_type='hive_metastore',
            host='localhost', extra="{\"authMechanism\": \"PLAIN\"}",
            port=9083))
    merge_conn(
        models.Connection(
            conn_id='mysql_default', conn_type='mysql',
            login='******',
            host='localhost'))
    merge_conn(
        models.Connection(
            conn_id='postgres_default', conn_type='postgres',
            login='******',
            schema='airflow',
            host='localhost'))
    merge_conn(
        models.Connection(
            conn_id='sqlite_default', conn_type='sqlite',
            host='/tmp/sqlite_default.db'))
    merge_conn(
        models.Connection(
            conn_id='http_default', conn_type='http',
            host='https://www.google.com/'))
    merge_conn(
        models.Connection(
            conn_id='mssql_default', conn_type='mssql',
            host='localhost', port=1433))
    merge_conn(
        models.Connection(
            conn_id='vertica_default', conn_type='vertica',
            host='localhost', port=5433))
    merge_conn(
        models.Connection(
            conn_id='webhdfs_default', conn_type='hdfs',
            host='localhost', port=50070))
    merge_conn(
        models.Connection(
            conn_id='ssh_default', conn_type='ssh',
            host='localhost'))
    merge_conn(
        models.Connection(
            conn_id='fs_default', conn_type='fs',
            extra='{"path": "/"}'))
    merge_conn(
        models.Connection(
            conn_id='aws_default', conn_type='aws',
            extra='{"region_name": "us-east-1"}'))
    merge_conn(
        models.Connection(
            conn_id='emr_default', conn_type='emr',
            extra='''
                {
                    "Name": "default_job_flow_name",
                    "LogUri": "s3://my-emr-log-bucket/default_job_flow_location",
                    "ReleaseLabel": "emr-4.6.0",
                    "Instances": {
                        "InstanceGroups": [
                            {
                                "Name": "Master nodes",
                                "Market": "ON_DEMAND",
                                "InstanceRole": "MASTER",
                                "InstanceType": "r3.2xlarge",
                                "InstanceCount": 1
                            },
                            {
                                "Name": "Slave nodes",
                                "Market": "ON_DEMAND",
                                "InstanceRole": "CORE",
                                "InstanceType": "r3.2xlarge",
                                "InstanceCount": 1
                            }
                        ]
                    },
                    "Ec2KeyName": "mykey",
                    "KeepJobFlowAliveWhenNoSteps": false,
                    "TerminationProtected": false,
                    "Ec2SubnetId": "somesubnet",
                    "Applications": [
                        { "Name": "Spark" }
                    ],
                    "VisibleToAllUsers": true,
                    "JobFlowRole": "EMR_EC2_DefaultRole",
                    "ServiceRole": "EMR_DefaultRole",
                    "Tags": [
                        {
                            "Key": "app",
                            "Value": "analytics"
                        },
                        {
                            "Key": "environment",
                            "Value": "development"
                        }
                    ]
                }
            '''))

    # Known event types
    KET = models.KnownEventType
    if not session.query(KET).filter(KET.know_event_type == 'Holiday').first():
        session.add(KET(know_event_type='Holiday'))
    if not session.query(KET).filter(KET.know_event_type == 'Outage').first():
        session.add(KET(know_event_type='Outage'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Natural Disaster').first():
        session.add(KET(know_event_type='Natural Disaster'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Marketing Campaign').first():
        session.add(KET(know_event_type='Marketing Campaign'))
    session.commit()

    dagbag = models.DagBag()
    # Save individual DAGs in the ORM
    now = datetime.utcnow()
    for dag in dagbag.dags.values():
        models.DAG.sync_to_db(dag, dag.owner, now)
    # Deactivate the unknown ones
    models.DAG.deactivate_unknown_dags(dagbag.dags.keys())

    Chart = models.Chart
    chart_label = "Airflow task instance by type"
    chart = session.query(Chart).filter(Chart.label == chart_label).first()
    if not chart:
        chart = Chart(
            label=chart_label,
            conn_id='airflow_db',
            chart_type='bar',
            x_is_date=False,
            sql=(
                "SELECT state, COUNT(1) as number "
                "FROM task_instance "
                "WHERE dag_id LIKE 'example%' "
                "GROUP BY state"),
        )
        session.add(chart)
    session.commit()
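Both `initdb` variants lean on a `merge_conn` helper that isn't shown in this listing. Judging from the query-then-add pattern used for `KnownEventType` in the same function, it is presumably an insert-if-absent helper along these lines (a hypothetical reconstruction, not the verbatim Airflow source):

def merge_conn(conn, session=None):
    # Hypothetical reconstruction: only add the default connection if no
    # row with that conn_id exists yet, so repeated initdb calls stay
    # idempotent and never clobber user-edited credentials.
    from airflow import models, settings
    session = session or settings.Session()
    C = models.Connection
    if not session.query(C).filter(C.conn_id == conn.conn_id).first():
        session.add(conn)
        session.commit()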
def _execute(self):
    """
    Runs a dag for a specified date range.
    """
    session = settings.Session()

    start_date = self.bf_start_date
    end_date = self.bf_end_date

    # picklin'
    pickle_id = None
    if not self.donot_pickle and self.executor.__class__ not in (
            executors.LocalExecutor, executors.SequentialExecutor):
        pickle = models.DagPickle(self.dag)
        session.add(pickle)
        session.commit()
        pickle_id = pickle.id

    executor = self.executor
    executor.start()

    # Build a list of all instances to run
    tasks_to_run = {}
    failed = []
    succeeded = []
    started = []
    wont_run = []
    for task in self.dag.tasks:
        if (not self.include_adhoc) and task.adhoc:
            continue

        start_date = start_date or task.start_date
        end_date = end_date or task.end_date or datetime.now()
        for dttm in utils.date_range(
                start_date, end_date, task.dag.schedule_interval):
            ti = models.TaskInstance(task, dttm)
            tasks_to_run[ti.key] = ti

    # Triggering what is ready to get triggered
    while tasks_to_run:
        # iterate over a copy since entries are deleted mid-loop
        for key, ti in list(tasks_to_run.items()):
            ti.refresh_from_db()
            if ti.state == State.SUCCESS and key in tasks_to_run:
                succeeded.append(key)
                del tasks_to_run[key]
            elif ti.is_runnable():
                executor.queue_task_instance(
                    ti,
                    mark_success=self.mark_success,
                    task_start_date=self.bf_start_date,
                    pickle_id=pickle_id,
                    ignore_dependencies=self.ignore_dependencies)
                ti.state = State.RUNNING
                if key not in started:
                    started.append(key)
        self.heartbeat()
        executor.heartbeat()

        # Reacting to events
        for key, state in executor.get_event_buffer().items():
            dag_id, task_id, execution_date = key
            if key not in tasks_to_run:
                continue
            ti = tasks_to_run[key]
            ti.refresh_from_db()
            if ti.state == State.FAILED:
                failed.append(key)
                logging.error("Task instance " + str(key) + " failed")
                del tasks_to_run[key]
                # Removing downstream tasks from the one that has failed
                for t in self.dag.get_task(task_id).get_flat_relatives(
                        upstream=False):
                    key = (ti.dag_id, t.task_id, execution_date)
                    if key in tasks_to_run:
                        wont_run.append(key)
                        del tasks_to_run[key]
            elif ti.state == State.SUCCESS:
                succeeded.append(key)
                del tasks_to_run[key]

        msg = (
            "[backfill progress] "
            "waiting: {0} | "
            "succeeded: {1} | "
            "kicked_off: {2} | "
            "failed: {3} | "
            "skipped: {4} ").format(
                len(tasks_to_run),
                len(succeeded),
                len(started),
                len(failed),
                len(wont_run))
        logging.info(msg)

    executor.end()
    session.close()
    if failed:
        raise AirflowException(
            "Some task instances failed, here's the list:\n" + str(failed))
    logging.info("All done. Exiting.")
def process_dag(self, dag, executor):
    """
    This method schedules a single DAG by looking at the latest
    run for each task and attempting to schedule the following run.

    As multiple schedulers may be running for redundancy, this
    function takes a lock on the DAG and timestamps the last run
    in ``last_scheduler_run``.
    """
    DagModel = models.DagModel
    session = settings.Session()

    db_dag = session.query(DagModel).filter(
        DagModel.dag_id == dag.dag_id).first()
    last_scheduler_run = db_dag.last_scheduler_run or datetime(2000, 1, 1)
    secs_since_last = (datetime.now() - last_scheduler_run).total_seconds()
    # if db_dag.scheduler_lock or
    if secs_since_last < self.heartrate:
        session.commit()
        session.close()
        return None
    else:
        # Taking a lock
        db_dag.scheduler_lock = True
        db_dag.last_scheduler_run = datetime.now()
        session.commit()

    TI = models.TaskInstance
    logging.info(
        "Getting latest instance "
        "for all tasks in dag " + dag.dag_id)
    sq = (
        session.query(
            TI.task_id,
            func.max(TI.execution_date).label('max_ti'))
        .filter(TI.dag_id == dag.dag_id)
        .group_by(TI.task_id)
        .subquery('sq'))

    qry = session.query(TI).filter(
        TI.dag_id == dag.dag_id,
        TI.task_id == sq.c.task_id,
        TI.execution_date == sq.c.max_ti,
    )
    logging.debug("Querying max dates for each task")
    latest_ti = qry.all()
    ti_dict = {ti.task_id: ti for ti in latest_ti}
    session.expunge_all()
    session.commit()
    logging.debug("{} rows returned".format(len(latest_ti)))

    for task in dag.tasks:
        if task.adhoc:
            continue
        if task.task_id not in ti_dict:
            # Brand new task, let's get started
            ti = TI(task, task.start_date)
            ti.refresh_from_db()
            if ti.is_queueable(flag_upstream_failed=True):
                logging.info('First run for {ti}'.format(**locals()))
                executor.queue_task_instance(ti)
        else:
            ti = ti_dict[task.task_id]
            ti.task = task  # Hacky but worky
            if ti.state == State.RUNNING:
                continue  # Only one task at a time
            elif ti.state == State.UP_FOR_RETRY:
                # If the task instance is up for retry, make sure
                # the retry delay is met
                if ti.is_runnable():
                    logging.debug('Triggering retry: ' + str(ti))
                    executor.queue_task_instance(ti)
            elif ti.state == State.QUEUED:
                # If it was queued, we skip it so that it gets prioritized
                # in self.prioritize_queued
                continue
            else:
                # Trying to run the next schedule
                next_schedule = (
                    ti.execution_date + task.schedule_interval)
                if (
                        ti.task.end_date and
                        next_schedule > ti.task.end_date):
                    continue
                ti = TI(
                    task=task,
                    execution_date=next_schedule,
                )
                ti.refresh_from_db()
                if ti.is_queueable(flag_upstream_failed=True):
                    logging.debug('Queuing next run: ' + str(ti))
                    executor.queue_task_instance(ti)

    # Releasing the lock
    logging.debug("Unlocking DAG (scheduler_lock)")
    db_dag = (
        session.query(DagModel)
        .filter(DagModel.dag_id == dag.dag_id)
        .first())
    db_dag.scheduler_lock = False
    session.merge(db_dag)
    session.commit()
    session.close()
def test_update_counters(self):
    dag = DAG(dag_id='test_manage_executor_state', start_date=DEFAULT_DATE)
    task1 = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

    job = BackfillJob(dag=dag)

    session = settings.Session()
    dr = dag.create_dagrun(run_id=DagRun.ID_PREFIX,
                           state=State.RUNNING,
                           execution_date=DEFAULT_DATE,
                           start_date=DEFAULT_DATE,
                           session=session)
    ti = TI(task1, dr.execution_date)
    ti.refresh_from_db()

    ti_status = BackfillJob._DagRunTaskStatus()

    # test for success
    ti.set_state(State.SUCCESS, session)
    ti_status.running[ti.key] = ti
    job._update_counters(ti_status=ti_status)
    self.assertTrue(len(ti_status.running) == 0)
    self.assertTrue(len(ti_status.succeeded) == 1)
    self.assertTrue(len(ti_status.skipped) == 0)
    self.assertTrue(len(ti_status.failed) == 0)
    self.assertTrue(len(ti_status.to_run) == 0)

    ti_status.succeeded.clear()

    # test for skipped
    ti.set_state(State.SKIPPED, session)
    ti_status.running[ti.key] = ti
    job._update_counters(ti_status=ti_status)
    self.assertTrue(len(ti_status.running) == 0)
    self.assertTrue(len(ti_status.succeeded) == 0)
    self.assertTrue(len(ti_status.skipped) == 1)
    self.assertTrue(len(ti_status.failed) == 0)
    self.assertTrue(len(ti_status.to_run) == 0)

    ti_status.skipped.clear()

    # test for failed
    ti.set_state(State.FAILED, session)
    ti_status.running[ti.key] = ti
    job._update_counters(ti_status=ti_status)
    self.assertTrue(len(ti_status.running) == 0)
    self.assertTrue(len(ti_status.succeeded) == 0)
    self.assertTrue(len(ti_status.skipped) == 0)
    self.assertTrue(len(ti_status.failed) == 1)
    self.assertTrue(len(ti_status.to_run) == 0)

    ti_status.failed.clear()

    # test for retry
    ti.set_state(State.UP_FOR_RETRY, session)
    ti_status.running[ti.key] = ti
    job._update_counters(ti_status=ti_status)
    self.assertTrue(len(ti_status.running) == 0)
    self.assertTrue(len(ti_status.succeeded) == 0)
    self.assertTrue(len(ti_status.skipped) == 0)
    self.assertTrue(len(ti_status.failed) == 0)
    self.assertTrue(len(ti_status.to_run) == 1)

    ti_status.to_run.clear()

    # test for reschedule
    ti.set_state(State.UP_FOR_RESCHEDULE, session)
    ti_status.running[ti.key] = ti
    job._update_counters(ti_status=ti_status)
    self.assertTrue(len(ti_status.running) == 0)
    self.assertTrue(len(ti_status.succeeded) == 0)
    self.assertTrue(len(ti_status.skipped) == 0)
    self.assertTrue(len(ti_status.failed) == 0)
    self.assertTrue(len(ti_status.to_run) == 1)

    ti_status.to_run.clear()

    # test for none
    ti.set_state(State.NONE, session)
    ti_status.running[ti.key] = ti
    job._update_counters(ti_status=ti_status)
    self.assertTrue(len(ti_status.running) == 0)
    self.assertTrue(len(ti_status.succeeded) == 0)
    self.assertTrue(len(ti_status.skipped) == 0)
    self.assertTrue(len(ti_status.failed) == 0)
    self.assertTrue(len(ti_status.to_run) == 1)

    ti_status.to_run.clear()

    session.close()
def _execute(self):
    """
    Runs a dag for a specified date range.
    """
    session = settings.Session()

    start_date = self.bf_start_date
    end_date = self.bf_end_date

    # picklin'
    pickle_id = None
    if not self.donot_pickle and self.executor.__class__ not in (
            executors.LocalExecutor, executors.SequentialExecutor):
        pickle = models.DagPickle(self.dag)
        session.add(pickle)
        session.commit()
        pickle_id = pickle.id

    executor = self.executor
    executor.start()
    executor_fails = Counter()

    # Build a list of all instances to run
    tasks_to_run = {}
    failed = set()
    succeeded = set()
    started = set()
    skipped = set()
    not_ready = set()
    deadlocked = set()

    for task in self.dag.tasks:
        if (not self.include_adhoc) and task.adhoc:
            continue

        start_date = start_date or task.start_date
        end_date = end_date or task.end_date or datetime.now()
        for dttm in self.dag.date_range(start_date, end_date=end_date):
            ti = models.TaskInstance(task, dttm)
            tasks_to_run[ti.key] = ti
            session.merge(ti)
    session.commit()

    # Triggering what is ready to get triggered
    while tasks_to_run and not deadlocked:

        not_ready.clear()

        for key, ti in list(tasks_to_run.items()):

            ti.refresh_from_db()
            ignore_depends_on_past = (
                self.ignore_first_depends_on_past and
                ti.execution_date == (start_date or ti.start_date))

            # The task was already marked successful or skipped by a
            # different Job. Don't rerun it.
            if key not in started:
                if ti.state == State.SUCCESS:
                    succeeded.add(key)
                    tasks_to_run.pop(key)
                    continue
                elif ti.state == State.SKIPPED:
                    skipped.add(key)
                    tasks_to_run.pop(key)
                    continue

            # Is the task runnable? -- then run it
            if ti.is_queueable(
                    include_queued=True,
                    ignore_depends_on_past=ignore_depends_on_past,
                    flag_upstream_failed=True):
                self.logger.debug('Sending {} to executor'.format(ti))
                executor.queue_task_instance(
                    ti,
                    mark_success=self.mark_success,
                    pickle_id=pickle_id,
                    ignore_dependencies=self.ignore_dependencies,
                    ignore_depends_on_past=ignore_depends_on_past,
                    pool=self.pool)
                started.add(key)

            # Mark the task as not ready to run
            elif ti.state in (State.NONE, State.UPSTREAM_FAILED):
                self.logger.debug('Added {} to not_ready'.format(ti))
                not_ready.add(key)

        self.heartbeat()
        executor.heartbeat()

        # If the set of tasks that aren't ready ever equals the set of
        # tasks to run, then the backfill is deadlocked
        if not_ready and not_ready == set(tasks_to_run):
            deadlocked.update(tasks_to_run.values())
            tasks_to_run.clear()

        # Reacting to events
        for key, state in list(executor.get_event_buffer().items()):
            dag_id, task_id, execution_date = key
            if key not in tasks_to_run:
                continue
            ti = tasks_to_run[key]
            ti.refresh_from_db()

            # executor reports failure
            if state == State.FAILED:

                # task reports running
                if ti.state == State.RUNNING:
                    msg = (
                        'Executor reports that task instance {} failed '
                        'although the task says it is running.'.format(key))
                    self.logger.error(msg)
                    ti.handle_failure(msg)
                    tasks_to_run.pop(key)

                # task reports skipped
                elif ti.state == State.SKIPPED:
                    self.logger.error("Skipping {} ".format(key))
                    skipped.add(key)
                    tasks_to_run.pop(key)

                # anything else is a failure
                else:
                    self.logger.error(
                        "Task instance {} failed".format(key))
                    failed.add(key)
                    tasks_to_run.pop(key)

            # executor reports success
            elif state == State.SUCCESS:

                # task reports success
                if ti.state == State.SUCCESS:
                    self.logger.info(
                        'Task instance {} succeeded'.format(key))
                    succeeded.add(key)
                    tasks_to_run.pop(key)

                # task reports failure
                elif ti.state == State.FAILED:
                    self.logger.error(
                        "Task instance {} failed".format(key))
                    failed.add(key)
                    tasks_to_run.pop(key)

                # task reports skipped
                elif ti.state == State.SKIPPED:
                    self.logger.info(
                        "Task instance {} skipped".format(key))
                    skipped.add(key)
                    tasks_to_run.pop(key)

                # this probably won't ever be triggered
                elif ti in not_ready:
                    self.logger.info(
                        "{} wasn't expected to run, but it did".format(ti))

                # executor reports success but task does not - this is weird
                elif ti.state not in (
                        State.SUCCESS,
                        State.QUEUED,
                        State.UP_FOR_RETRY):
                    self.logger.error(
                        "The airflow run command failed "
                        "at reporting an error. This should not occur "
                        "in normal circumstances. Task state is '{}', "
                        "reported state is '{}'. TI is {}"
                        "".format(ti.state, state, ti))

                    # if the executor fails 3 or more times, stop trying to
                    # run the task
                    executor_fails[key] += 1
                    if executor_fails[key] >= 3:
                        msg = (
                            'The airflow run command failed to report an '
                            'error for task {} three or more times. The '
                            'task is being marked as failed. This is very '
                            'unusual and probably means that an error is '
                            'taking place before the task even '
                            'starts.'.format(key))
                        self.logger.error(msg)
                        ti.handle_failure(msg)
                        tasks_to_run.pop(key)

        msg = ' | '.join([
            "[backfill progress]",
            "waiting: {0}",
            "succeeded: {1}",
            "kicked_off: {2}",
            "failed: {3}",
            "skipped: {4}",
            "deadlocked: {5}"
        ]).format(
            len(tasks_to_run),
            len(succeeded),
            len(started),
            len(failed),
            len(skipped),
            len(deadlocked))
        self.logger.info(msg)

    executor.end()
    session.close()

    err = ''
    if failed:
        err += ("---------------------------------------------------\n"
                "Some task instances failed:\n{}\n".format(failed))
    if deadlocked:
        err += ('---------------------------------------------------\n'
                'BackfillJob is deadlocked.')
        deadlocked_depends_on_past = any(
            t.are_dependencies_met() != t.are_dependencies_met(
                ignore_depends_on_past=True)
            for t in deadlocked)
        if deadlocked_depends_on_past:
            err += (
                'Some of the deadlocked tasks were unable to run because '
                'of "depends_on_past" relationships. Try running the '
                'backfill with the option '
                '"ignore_first_depends_on_past=True" or passing "-I" at '
                'the command line.')
        err += ' These tasks were unable to run:\n{}\n'.format(deadlocked)
    if err:
        raise AirflowException(err)

    self.logger.info("Backfill done. Exiting.")
def test_backfill_max_limit_check(self):
    dag_id = 'test_backfill_max_limit_check'
    run_id = 'test_dagrun'
    start_date = DEFAULT_DATE - datetime.timedelta(hours=1)
    end_date = DEFAULT_DATE

    dag_run_created_cond = threading.Condition()

    def run_backfill(cond):
        cond.acquire()
        try:
            dag = self._get_dag_test_max_active_limits(dag_id)

            # this session object is different than the one in the main thread
            thread_session = settings.Session()

            # Existing dagrun that is not within the backfill range
            dag.create_dagrun(
                run_id=run_id,
                state=State.RUNNING,
                execution_date=DEFAULT_DATE + datetime.timedelta(hours=1),
                start_date=DEFAULT_DATE,
            )
            thread_session.commit()
            cond.notify()
        finally:
            cond.release()

        executor = TestExecutor()
        job = BackfillJob(dag=dag,
                          start_date=start_date,
                          end_date=end_date,
                          executor=executor,
                          donot_pickle=True)
        job.run()

        thread_session.close()

    backfill_job_thread = threading.Thread(target=run_backfill,
                                           name="run_backfill",
                                           args=(dag_run_created_cond,))

    dag_run_created_cond.acquire()
    session = settings.Session()
    backfill_job_thread.start()
    try:
        # at this point backfill can't run since the max_active_runs has been
        # reached, so it is waiting
        dag_run_created_cond.wait(timeout=1.5)
        dagruns = DagRun.find(dag_id=dag_id)
        dr = dagruns[0]
        self.assertEqual(1, len(dagruns))
        self.assertEqual(dr.run_id, run_id)

        # allow the backfill to execute by setting the existing dag run to
        # SUCCESS; backfill will execute dag runs 1 by 1
        dr.set_state(State.SUCCESS)
        session.merge(dr)
        session.commit()
        session.close()

        backfill_job_thread.join()

        dagruns = DagRun.find(dag_id=dag_id)
        self.assertEqual(3, len(dagruns))  # 2 from backfill + 1 existing
        self.assertEqual(dagruns[-1].run_id, dr.run_id)
    finally:
        dag_run_created_cond.release()
def run(
        self,
        verbose=True,
        ignore_dependencies=False,  # Doesn't check for deps, just runs
        force=False,  # Disregards previous successes
        mark_success=False,  # Don't run the task, act as if it succeeded
        test_mode=False,  # Doesn't record success or failure in the DB
):
    """
    Runs the task instance.
    """
    task = self.task
    session = settings.Session()
    self.refresh_from_db(session)
    iso = datetime.now().isoformat()
    self.hostname = socket.gethostname()

    msg = "\n"
    msg += ("-" * 80)
    if self.state == State.UP_FOR_RETRY:
        msg += "\nRetry run {self.try_number} out of {task.retries} "
        msg += "starting @{iso}\n"
    else:
        msg += "\nNew run starting @{iso}\n"
    msg += ("-" * 80)
    logging.info(msg.format(**locals()))

    if not force and self.state == State.SUCCESS:
        logging.info(
            "Task {self} previously succeeded"
            " on {self.end_date}".format(**locals()))
    elif not ignore_dependencies and \
            not self.are_dependencies_met(session):
        logging.warning("Dependencies not met yet")
    elif self.state == State.UP_FOR_RETRY and \
            not self.ready_for_retry():
        next_run = (self.end_date + task.retry_delay).isoformat()
        logging.info(
            "Not ready for retry yet. " +
            "Next run after {0}".format(next_run))
    elif force or self.state in State.runnable():
        if self.state == State.UP_FOR_RETRY:
            self.try_number += 1
        else:
            self.try_number = 1
        if not test_mode:
            session.add(Log(State.RUNNING, self))
        self.state = State.RUNNING
        self.start_date = datetime.now()
        self.end_date = None
        if not test_mode:
            session.merge(self)
        session.commit()
        if verbose:
            if mark_success:
                msg = "Marking success for "
            else:
                msg = "Executing "
            msg += "{self.task} for {self.execution_date}"
            logging.info(msg.format(self=self))
        try:
            if not mark_success:
                from airflow import macros
                tables = None
                if 'tables' in task.params:
                    tables = task.params['tables']
                jinja_context = {
                    'dag': task.dag,
                    'ds': self.execution_date.isoformat()[:10],
                    'execution_date': self.execution_date,
                    'macros': macros,
                    'params': task.params,
                    'tables': tables,
                    'task': task,
                    'task_instance': self,
                    'ti': self,
                }
                task_copy = copy.copy(task)
                for attr in task_copy.__class__.template_fields:
                    source = getattr(task_copy, attr)
                    template = self.get_template(source)
                    setattr(task_copy, attr, template.render(**jinja_context))
                task_copy.execute(self.execution_date)
        except Exception as e:
            session = settings.Session()
            self.end_date = datetime.now()
            self.set_duration()
            if not test_mode:
                session.add(Log(State.FAILED, self))

            # Let's go deeper
            try:
                if self.try_number <= task.retries:
                    self.state = State.UP_FOR_RETRY
                    if task.email_on_retry and task.email:
                        self.email_alert(e, is_retry=True)
                else:
                    self.state = State.FAILED
                    if task.email_on_failure and task.email:
                        self.email_alert(e, is_retry=False)
            except Exception as e2:
                logging.error('Failed to send email to: ' + str(task.email))
                logging.error(str(e2))

            if not test_mode:
                session.merge(self)
            session.commit()
            logging.error(str(e))
            raise e

        session = settings.Session()
        self.end_date = datetime.now()
        self.set_duration()
        self.state = State.SUCCESS
        if not test_mode:
            session.add(Log(State.SUCCESS, self))
            session.merge(self)
        session.commit()
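The templating step inside `run()` above is easy to miss: it renders every attribute named in the operator's `template_fields` against the Jinja context, on a shallow copy so the DAG-level definition is untouched. Here is a self-contained sketch of that step using plain Jinja2; the `Operator` class is illustrative, not an Airflow class.

import copy

from jinja2 import Template


class Operator:
    template_fields = ('bash_command',)

    def __init__(self, bash_command):
        self.bash_command = bash_command


task = Operator(bash_command='echo run for {{ ds }}')
jinja_context = {'ds': '2016-01-01'}

# Render every declared template field on a copy, as run() does above,
# so the original task definition keeps its placeholders.
task_copy = copy.copy(task)
for attr in task_copy.__class__.template_fields:
    source = getattr(task_copy, attr)
    setattr(task_copy, attr, Template(source).render(**jinja_context))

print(task_copy.bash_command)  # echo run for 2016-01-01
print(task.bash_command)       # echo run for {{ ds }}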
def initdb(rbac=False):
    session = settings.Session()

    from airflow import models
    upgradedb()

    merge_conn(
        models.Connection(
            conn_id='airflow_db', conn_type='mysql',
            host='mysql', login='******', password='',
            schema='airflow'))
    merge_conn(
        models.Connection(
            conn_id='beeline_default', conn_type='beeline', port="10000",
            host='localhost', extra="{\"use_beeline\": true, \"auth\": \"\"}",
            schema='default'))
    merge_conn(
        models.Connection(
            conn_id='bigquery_default', conn_type='google_cloud_platform',
            schema='default'))
    merge_conn(
        models.Connection(
            conn_id='local_mysql', conn_type='mysql',
            host='localhost', login='******', password='******',
            schema='airflow'))
    merge_conn(
        models.Connection(
            conn_id='presto_default', conn_type='presto',
            host='localhost',
            schema='hive', port=3400))
    merge_conn(
        models.Connection(
            conn_id='google_cloud_default', conn_type='google_cloud_platform',
            schema='default',))
    merge_conn(
        models.Connection(
            conn_id='hive_cli_default', conn_type='hive_cli',
            schema='default',))
    merge_conn(
        models.Connection(
            conn_id='hiveserver2_default', conn_type='hiveserver2',
            host='localhost',
            schema='default', port=10000))
    merge_conn(
        models.Connection(
            conn_id='metastore_default', conn_type='hive_metastore',
            host='localhost', extra="{\"authMechanism\": \"PLAIN\"}",
            port=9083))
    merge_conn(
        models.Connection(
            conn_id='mongo_default', conn_type='mongo',
            host='mongo', port=27017))
    merge_conn(
        models.Connection(
            conn_id='mysql_default', conn_type='mysql',
            login='******',
            schema='airflow',
            host='mysql'))
    merge_conn(
        models.Connection(
            conn_id='postgres_default', conn_type='postgres',
            login='******',
            password='******',
            schema='airflow',
            host='postgres'))
    merge_conn(
        models.Connection(
            conn_id='sqlite_default', conn_type='sqlite',
            host='/tmp/sqlite_default.db'))
    merge_conn(
        models.Connection(
            conn_id='http_default', conn_type='http',
            host='https://www.google.com/'))
    merge_conn(
        models.Connection(
            conn_id='mssql_default', conn_type='mssql',
            host='localhost', port=1433))
    merge_conn(
        models.Connection(
            conn_id='vertica_default', conn_type='vertica',
            host='localhost', port=5433))
    merge_conn(
        models.Connection(
            conn_id='wasb_default', conn_type='wasb',
            extra='{"sas_token": null}'))
    merge_conn(
        models.Connection(
            conn_id='webhdfs_default', conn_type='hdfs',
            host='localhost', port=50070))
    merge_conn(
        models.Connection(
            conn_id='ssh_default', conn_type='ssh',
            host='localhost'))
    merge_conn(
        models.Connection(
            conn_id='sftp_default', conn_type='sftp',
            host='localhost', port=22, login='******',
            extra='''
                {"key_file": "~/.ssh/id_rsa", "no_host_key_check": true}
            '''))
    merge_conn(
        models.Connection(
            conn_id='fs_default', conn_type='fs',
            extra='{"path": "/"}'))
    merge_conn(
        models.Connection(
            conn_id='aws_default', conn_type='aws',
            extra='{"region_name": "us-east-1"}'))
    merge_conn(
        models.Connection(
            conn_id='spark_default', conn_type='spark',
            host='yarn', extra='{"queue": "root.default"}'))
    merge_conn(
        models.Connection(
            conn_id='druid_broker_default', conn_type='druid',
            host='druid-broker', port=8082,
            extra='{"endpoint": "druid/v2/sql"}'))
    merge_conn(
        models.Connection(
            conn_id='druid_ingest_default', conn_type='druid',
            host='druid-overlord', port=8081,
            extra='{"endpoint": "druid/indexer/v1/task"}'))
    merge_conn(
        models.Connection(
            conn_id='redis_default', conn_type='redis',
            host='redis', port=6379,
            extra='{"db": 0}'))
    merge_conn(
        models.Connection(
            conn_id='sqoop_default', conn_type='sqoop',
            host='rmdbs', extra=''))
    merge_conn(
        models.Connection(
            conn_id='emr_default', conn_type='emr',
            extra='''
                {
                    "Name": "default_job_flow_name",
                    "LogUri": "s3://my-emr-log-bucket/default_job_flow_location",
                    "ReleaseLabel": "emr-4.6.0",
                    "Instances": {
                        "Ec2KeyName": "mykey",
                        "Ec2SubnetId": "somesubnet",
                        "InstanceGroups": [
                            {
                                "Name": "Master nodes",
                                "Market": "ON_DEMAND",
                                "InstanceRole": "MASTER",
                                "InstanceType": "r3.2xlarge",
                                "InstanceCount": 1
                            },
                            {
                                "Name": "Slave nodes",
                                "Market": "ON_DEMAND",
                                "InstanceRole": "CORE",
                                "InstanceType": "r3.2xlarge",
                                "InstanceCount": 1
                            }
                        ],
                        "TerminationProtected": false,
                        "KeepJobFlowAliveWhenNoSteps": false
                    },
                    "Applications": [
                        { "Name": "Spark" }
                    ],
                    "VisibleToAllUsers": true,
                    "JobFlowRole": "EMR_EC2_DefaultRole",
                    "ServiceRole": "EMR_DefaultRole",
                    "Tags": [
                        {
                            "Key": "app",
                            "Value": "analytics"
                        },
                        {
                            "Key": "environment",
                            "Value": "development"
                        }
                    ]
                }
            '''))
    merge_conn(
        models.Connection(
            conn_id='databricks_default', conn_type='databricks',
            host='localhost'))
    merge_conn(
        models.Connection(
            conn_id='qubole_default', conn_type='qubole',
            host='localhost'))
    merge_conn(
        models.Connection(
            conn_id='segment_default', conn_type='segment',
            extra='{"write_key": "my-segment-write-key"}'))
    merge_conn(
        models.Connection(
            conn_id='azure_data_lake_default', conn_type='azure_data_lake',
            extra='{"tenant": "<TENANT>", "account_name": "<ACCOUNTNAME>" }'))
    merge_conn(
        models.Connection(
            conn_id='azure_cosmos_default', conn_type='azure_cosmos',
            extra='{"database_name": "<DATABASE_NAME>", '
                  '"collection_name": "<COLLECTION_NAME>" }'))
    merge_conn(
        models.Connection(
            conn_id='cassandra_default', conn_type='cassandra',
            host='cassandra', port=9042))

    # Known event types
    KET = models.KnownEventType
    if not session.query(KET).filter(KET.know_event_type == 'Holiday').first():
        session.add(KET(know_event_type='Holiday'))
    if not session.query(KET).filter(KET.know_event_type == 'Outage').first():
        session.add(KET(know_event_type='Outage'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Natural Disaster').first():
        session.add(KET(know_event_type='Natural Disaster'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Marketing Campaign').first():
        session.add(KET(know_event_type='Marketing Campaign'))
    session.commit()

    dagbag = models.DagBag()
    # Save individual DAGs in the ORM
    for dag in dagbag.dags.values():
        dag.sync_to_db()
    # Deactivate the unknown ones
    models.DAG.deactivate_unknown_dags(dagbag.dags.keys())

    Chart = models.Chart
    chart_label = "Airflow task instance by type"
    chart = session.query(Chart).filter(Chart.label == chart_label).first()
    if not chart:
        chart = Chart(
            label=chart_label,
            conn_id='airflow_db',
            chart_type='bar',
            x_is_date=False,
            sql=(
                "SELECT state, COUNT(1) as number "
                "FROM task_instance "
                "WHERE dag_id LIKE 'example%' "
                "GROUP BY state"),
        )
        session.add(chart)
    session.commit()

    if rbac:
        from flask_appbuilder.security.sqla import models
        from flask_appbuilder.models.sqla import Base
        Base.metadata.create_all(settings.engine)
def tearDown(self):
    session = settings.Session()
    session.query(Connection).filter(
        Connection.conn_id == TEST_CONN).delete()
    session.commit()
    session.close()
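Most of these snippets repeat the same open/commit/close choreography by hand. A small context manager (similar in spirit to the `create_session` helper later Airflow versions ship, but sketched here only from the pattern above) keeps that in one place:

from contextlib import contextmanager

from airflow import settings


@contextmanager
def managed_session():
    # Sketch: commit on success, roll back on error, always close.
    session = settings.Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

With it, the tearDown above shrinks to a single `with managed_session() as session:` block around the delete.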
def test_get_states_count_upstream_ti(self):
    """
    this test tests the helper function '_get_states_count_upstream_ti'
    as a unit and inside update_state
    """
    from airflow.ti_deps.dep_context import DepContext

    get_states_count_upstream_ti = TriggerRuleDep._get_states_count_upstream_ti
    session = settings.Session()
    now = timezone.utcnow()
    dag = DAG(
        'test_dagrun_with_pre_tis',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op3 = DummyOperator(task_id='C')
        op4 = DummyOperator(task_id='D')
        op5 = DummyOperator(task_id='E', trigger_rule=TriggerRule.ONE_FAILED)

        op1.set_downstream([op2, op3])  # op1 >> op2, op3
        op4.set_upstream([op3, op2])  # op3, op2 >> op4
        op5.set_upstream([op2, op3, op4])  # (op2, op3, op4) >> op5

    clear_db_runs()
    dag.clear()
    dr = dag.create_dagrun(run_id='test_dagrun_with_pre_tis',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    ti_op1 = TaskInstance(task=dag.get_task(op1.task_id),
                          execution_date=dr.execution_date)
    ti_op2 = TaskInstance(task=dag.get_task(op2.task_id),
                          execution_date=dr.execution_date)
    ti_op3 = TaskInstance(task=dag.get_task(op3.task_id),
                          execution_date=dr.execution_date)
    ti_op4 = TaskInstance(task=dag.get_task(op4.task_id),
                          execution_date=dr.execution_date)
    ti_op5 = TaskInstance(task=dag.get_task(op5.task_id),
                          execution_date=dr.execution_date)

    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    ti_op3.set_state(state=State.SUCCESS, session=session)
    ti_op4.set_state(state=State.SUCCESS, session=session)
    ti_op5.set_state(state=State.SUCCESS, session=session)

    session.commit()

    # check handling with cases that tasks are triggered
    # from backfill with no finished tasks
    finished_tasks = DepContext().ensure_finished_tasks(
        ti_op2.task.dag, ti_op2.execution_date, session)
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks,
                                     ti=ti_op2),
        (1, 0, 0, 0, 1))
    finished_tasks = dr.get_task_instances(state=State.finished,
                                           session=session)
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks,
                                     ti=ti_op4),
        (1, 0, 1, 0, 2))
    self.assertEqual(
        get_states_count_upstream_ti(finished_tasks=finished_tasks,
                                     ti=ti_op5),
        (2, 0, 1, 0, 3))

    dr.update_state()
    self.assertEqual(State.SUCCESS, dr.state)
def __init__(self):
    self.session = settings.Session()
def unregister_connection(self, connection):
    self.log.info(f"removing connection: {connection}")
    session = settings.Session()
    session.delete(connection)
    session.commit()
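The registration counterpart to the helper above isn't shown in this listing; under the same session pattern it would presumably look like this (a hypothetical sketch, not the original helper):

def register_connection(self, connection):
    # Hypothetical counterpart to unregister_connection above: persist a
    # Connection row, mirroring the delete/commit pattern, and close the
    # session when done.
    self.log.info(f"adding connection: {connection}")
    session = settings.Session()
    session.add(connection)
    session.commit()
    session.close()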
def test_external_task_sensor_fn_multiple_execution_dates(self):
    bash_command_code = """
{% set s=execution_date.time().second %}
echo "second is {{ s }}"
if [[ $(( {{ s }} % 60 )) == 1 ]]
    then
        exit 1
fi
exit 0
"""
    dag_external_id = TEST_DAG_ID + '_external'
    dag_external = DAG(
        dag_external_id,
        default_args=self.args,
        schedule_interval=timedelta(seconds=1))
    task_external_with_failure = BashOperator(
        task_id="task_external_with_failure",
        bash_command=bash_command_code,
        retries=0,
        dag=dag_external)
    task_external_without_failure = DummyOperator(
        task_id="task_external_without_failure",
        retries=0,
        dag=dag_external)

    task_external_without_failure.run(
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE + timedelta(seconds=1),
        ignore_ti_state=True)

    session = settings.Session()
    TI = TaskInstance
    try:
        task_external_with_failure.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + timedelta(seconds=1),
            ignore_ti_state=True)
        # The test_with_failure task is expected to fail
        # once per minute (the run on the first second of
        # each minute).
    except Exception as e:  # pylint: disable=broad-except
        failed_tis = session.query(TI).filter(
            TI.dag_id == dag_external_id,
            TI.state == State.FAILED,
            TI.execution_date == DEFAULT_DATE + timedelta(seconds=1)).all()
        if len(failed_tis) == 1 and \
                failed_tis[0].task_id == 'task_external_with_failure':
            pass
        else:
            raise e

    dag_id = TEST_DAG_ID
    dag = DAG(
        dag_id,
        default_args=self.args,
        schedule_interval=timedelta(minutes=1))
    task_without_failure = ExternalTaskSensor(
        task_id='task_without_failure',
        external_dag_id=dag_external_id,
        external_task_id='task_external_without_failure',
        execution_date_fn=lambda dt: [dt + timedelta(seconds=i)
                                      for i in range(2)],
        allowed_states=['success'],
        retries=0,
        timeout=1,
        poke_interval=1,
        dag=dag)
    task_with_failure = ExternalTaskSensor(
        task_id='task_with_failure',
        external_dag_id=dag_external_id,
        external_task_id='task_external_with_failure',
        execution_date_fn=lambda dt: [dt + timedelta(seconds=i)
                                      for i in range(2)],
        allowed_states=['success'],
        retries=0,
        timeout=1,
        poke_interval=1,
        dag=dag)

    task_without_failure.run(
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE,
        ignore_ti_state=True)

    with self.assertRaises(AirflowSensorTimeout):
        task_with_failure.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True)
def schedule_dag(self, dag):
    """
    This method checks whether a new DagRun needs to be created
    for a DAG based on scheduling interval
    Returns DagRun if one is scheduled. Otherwise returns None.
    """
    if dag.schedule_interval:
        DagRun = models.DagRun
        session = settings.Session()
        qry = session.query(DagRun).filter(
            DagRun.dag_id == dag.dag_id,
            DagRun.external_trigger == False,
            DagRun.state == State.RUNNING,
        )
        active_runs = qry.all()
        if len(active_runs) >= dag.max_active_runs:
            return
        for dr in active_runs:
            if (
                    dr.start_date and dag.dagrun_timeout and
                    dr.start_date < datetime.now() - dag.dagrun_timeout):
                dr.state = State.FAILED
                dr.end_date = datetime.now()
        session.commit()

        qry = session.query(func.max(DagRun.execution_date)).filter_by(
            dag_id=dag.dag_id).filter(
                or_(DagRun.external_trigger == False,
                    # add % as a wildcard for the like query
                    DagRun.run_id.like(DagRun.ID_PREFIX + '%')))
        last_scheduled_run = qry.scalar()
        next_run_date = None
        if not last_scheduled_run:
            # First run
            TI = models.TaskInstance
            latest_run = (
                session.query(func.max(TI.execution_date))
                .filter_by(dag_id=dag.dag_id)
                .scalar()
            )
            if latest_run:
                # Migrating from previous version
                # make the past 5 runs active
                next_run_date = dag.date_range(latest_run, -5)[0]
            else:
                next_run_date = min([t.start_date for t in dag.tasks])
        elif dag.schedule_interval != '@once':
            next_run_date = dag.following_schedule(last_scheduled_run)
        elif dag.schedule_interval == '@once' and not last_scheduled_run:
            next_run_date = datetime.now()

        # this structure is necessary to avoid a TypeError from concatenating
        # NoneType
        if dag.schedule_interval == '@once':
            schedule_end = next_run_date
        elif next_run_date:
            schedule_end = dag.following_schedule(next_run_date)

        if next_run_date and dag.end_date and next_run_date > dag.end_date:
            return

        if next_run_date and schedule_end and schedule_end <= datetime.now():
            next_run = DagRun(
                dag_id=dag.dag_id,
                run_id='scheduled__' + next_run_date.isoformat(),
                execution_date=next_run_date,
                start_date=datetime.now(),
                state=State.RUNNING,
                external_trigger=False
            )
            session.add(next_run)
            session.commit()
            return next_run
def test_dagrun_update_state_end_date(self):
    session = settings.Session()

    dag = DAG(
        'test_dagrun_update_state_end_date',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    # A -> B
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op1.set_upstream(op2)

    dag.clear()

    now = timezone.utcnow()
    dr = dag.create_dagrun(run_id='test_dagrun_update_state_end_date',
                           state=State.RUNNING,
                           execution_date=now,
                           start_date=now)

    # Initial end_date should be NULL
    # State.SUCCESS and State.FAILED are all ending state and should set end_date
    # State.RUNNING set end_date back to NULL
    session.merge(dr)
    session.commit()
    self.assertIsNone(dr.end_date)

    ti_op1 = dr.get_task_instance(task_id=op1.task_id)
    ti_op1.set_state(state=State.SUCCESS, session=session)
    ti_op2 = dr.get_task_instance(task_id=op2.task_id)
    ti_op2.set_state(state=State.SUCCESS, session=session)

    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date'
    ).one()
    self.assertIsNotNone(dr_database.end_date)
    self.assertEqual(dr.end_date, dr_database.end_date)

    ti_op1.set_state(state=State.RUNNING, session=session)
    ti_op2.set_state(state=State.RUNNING, session=session)
    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date'
    ).one()

    self.assertEqual(dr._state, State.RUNNING)
    self.assertIsNone(dr.end_date)
    self.assertIsNone(dr_database.end_date)

    ti_op1.set_state(state=State.FAILED, session=session)
    ti_op2.set_state(state=State.FAILED, session=session)
    dr.update_state()

    dr_database = session.query(DagRun).filter(
        DagRun.run_id == 'test_dagrun_update_state_end_date'
    ).one()

    self.assertIsNotNone(dr_database.end_date)
    self.assertEqual(dr.end_date, dr_database.end_date)
def _execute(self):
    """
    Runs a dag for a specified date range.
    """
    session = settings.Session()

    start_date = self.bf_start_date
    end_date = self.bf_end_date

    # picklin'
    pickle_id = None
    if not self.donot_pickle and self.executor.__class__ not in (
            executors.LocalExecutor, executors.SequentialExecutor):
        pickle = models.DagPickle(self.dag)
        session.add(pickle)
        session.commit()
        pickle_id = pickle.id

    executor = self.executor
    executor.start()

    # Build a list of all instances to run
    tasks_to_run = {}
    failed = []
    succeeded = []
    started = []
    wont_run = []
    for task in self.dag.tasks:
        if (not self.include_adhoc) and task.adhoc:
            continue

        start_date = start_date or task.start_date
        end_date = end_date or task.end_date or datetime.now()
        for dttm in self.dag.date_range(start_date, end_date=end_date):
            ti = models.TaskInstance(task, dttm)
            tasks_to_run[ti.key] = ti

    # Triggering what is ready to get triggered
    while tasks_to_run:
        for key, ti in list(tasks_to_run.items()):
            ti.refresh_from_db()
            if ti.state in (
                    State.SUCCESS, State.SKIPPED) and key in tasks_to_run:
                succeeded.append(key)
                tasks_to_run.pop(key)
            elif ti.state in (State.RUNNING, State.QUEUED):
                continue
            elif ti.is_runnable(flag_upstream_failed=True):
                executor.queue_task_instance(
                    ti,
                    mark_success=self.mark_success,
                    task_start_date=self.bf_start_date,
                    pickle_id=pickle_id,
                    ignore_dependencies=self.ignore_dependencies,
                    pool=self.pool)
                ti.state = State.RUNNING
                if key not in started:
                    started.append(key)
        self.heartbeat()
        executor.heartbeat()

        # Reacting to events
        for key, state in list(executor.get_event_buffer().items()):
            dag_id, task_id, execution_date = key
            if key not in tasks_to_run:
                continue
            ti = tasks_to_run[key]
            ti.refresh_from_db()
            if (
                    ti.state in (State.FAILED, State.SKIPPED) or
                    state == State.FAILED):
                if ti.state == State.FAILED or state == State.FAILED:
                    failed.append(key)
                    self.logger.error("Task instance " + str(key) + " failed")
                elif ti.state == State.SKIPPED:
                    wont_run.append(key)
                    self.logger.error("Skipping " + str(key) + " failed")
                tasks_to_run.pop(key)

                # Removing downstream tasks that also shouldn't run
                for t in self.dag.get_task(task_id).get_flat_relatives(
                        upstream=False):
                    key = (ti.dag_id, t.task_id, execution_date)
                    if key in tasks_to_run:
                        wont_run.append(key)
                        tasks_to_run.pop(key)
            elif ti.state == State.SUCCESS and state == State.SUCCESS:
                succeeded.append(key)
                tasks_to_run.pop(key)
            elif (
                    ti.state not in (State.SUCCESS, State.QUEUED) and
                    state == State.SUCCESS):
                self.logger.error(
                    "The airflow run command failed "
                    "at reporting an error. This should not occur "
                    "in normal circumstances. Task state is '{}', "
                    "reported state is '{}'. TI is {}"
                    "".format(ti.state, state, ti))

        msg = (
            "[backfill progress] "
            "waiting: {0} | "
            "succeeded: {1} | "
            "kicked_off: {2} | "
            "failed: {3} | "
            "wont_run: {4} ").format(
                len(tasks_to_run),
                len(succeeded),
                len(started),
                len(failed),
                len(wont_run))
        self.logger.info(msg)

    executor.end()
    session.close()
    if failed:
        msg = (
            "------------------------------------------\n"
            "Some task instances failed, "
            "here's the list:\n{}".format(failed))
        raise AirflowException(msg)
    self.logger.info("All done. Exiting.")
def test_backfill_fill_blanks(self):
    dag = DAG(
        'test_backfill_fill_blanks',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'},
    )

    with dag:
        op1 = DummyOperator(task_id='op1')
        op2 = DummyOperator(task_id='op2')
        op3 = DummyOperator(task_id='op3')
        op4 = DummyOperator(task_id='op4')
        op5 = DummyOperator(task_id='op5')
        op6 = DummyOperator(task_id='op6')

    dag.clear()
    dr = dag.create_dagrun(run_id='test',
                           state=State.RUNNING,
                           execution_date=DEFAULT_DATE,
                           start_date=DEFAULT_DATE)

    executor = TestExecutor()

    session = settings.Session()

    tis = dr.get_task_instances()
    for ti in tis:
        if ti.task_id == op1.task_id:
            ti.state = State.UP_FOR_RETRY
            ti.end_date = DEFAULT_DATE
        elif ti.task_id == op2.task_id:
            ti.state = State.FAILED
        elif ti.task_id == op3.task_id:
            ti.state = State.SKIPPED
        elif ti.task_id == op4.task_id:
            ti.state = State.SCHEDULED
        elif ti.task_id == op5.task_id:
            ti.state = State.UPSTREAM_FAILED
        # op6 is deliberately left with state None, the "blank" the
        # backfill is expected to fill
        session.merge(ti)
    session.commit()
    session.close()

    job = BackfillJob(dag=dag,
                      start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE,
                      executor=executor)
    self.assertRaisesRegex(
        AirflowException,
        'Some task instances failed',
        job.run)

    # the run_id should have changed, so a refresh won't work
    self.assertRaises(sqlalchemy.orm.exc.NoResultFound, dr.refresh_from_db)
    drs = DagRun.find(dag_id=dag.dag_id, execution_date=DEFAULT_DATE)
    dr = drs[0]

    self.assertEqual(dr.state, State.FAILED)

    tis = dr.get_task_instances()
    for ti in tis:
        if ti.task_id in (op1.task_id, op4.task_id, op6.task_id):
            self.assertEqual(ti.state, State.SUCCESS)
        elif ti.task_id == op2.task_id:
            self.assertEqual(ti.state, State.FAILED)
        elif ti.task_id == op3.task_id:
            self.assertEqual(ti.state, State.SKIPPED)
        elif ti.task_id == op5.task_id:
            self.assertEqual(ti.state, State.UPSTREAM_FAILED)
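TestExecutor is used above but never defined in this excerpt. A plausible minimal stand-in, assuming it behaves like a synchronous executor that reports SUCCESS for everything it is asked to run (an assumption, not Airflow's actual test helper):

    from airflow.executors.base_executor import BaseExecutor
    from airflow.utils.state import State

    class TestExecutor(BaseExecutor):
        # Hypothetical sketch: pretend every task ran and succeeded, so
        # the backfill's event loop sees SUCCESS events immediately.
        def execute_async(self, key, command, queue=None):
            self.change_state(key, State.SUCCESS)

        def end(self):
            # nothing to shut down in a synchronous stub
            self.heartbeat()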
def initdb():
    from airflow import models
    upgradedb()

    # Creating the local_mysql DB connection
    C = models.Connection
    session = settings.Session()

    conn = session.query(C).filter(C.conn_id == 'local_mysql').first()
    if not conn:
        session.add(
            models.Connection(conn_id='local_mysql', conn_type='mysql',
                              host='localhost', login='******',
                              password='******', schema='airflow'))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'presto_default').first()
    if not conn:
        session.add(
            models.Connection(conn_id='presto_default', conn_type='presto',
                              host='localhost', schema='hive', port=3400))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'hive_cli_default').first()
    if not conn:
        session.add(
            models.Connection(conn_id='hive_cli_default',
                              conn_type='hive_cli', schema='default'))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'hiveserver2_default').first()
    if not conn:
        session.add(
            models.Connection(conn_id='hiveserver2_default',
                              conn_type='hiveserver2', host='localhost',
                              schema='default', port=10000))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'metastore_default').first()
    if not conn:
        session.add(
            models.Connection(conn_id='metastore_default',
                              conn_type='hive_metastore', host='localhost',
                              port=10001))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'mysql_default').first()
    if not conn:
        session.add(
            models.Connection(conn_id='mysql_default', conn_type='mysql',
                              host='localhost'))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'sqlite_default').first()
    if not conn:
        home = conf.get('core', 'AIRFLOW_HOME')
        session.add(
            models.Connection(conn_id='sqlite_default', conn_type='sqlite',
                              host='{}/sqlite_default.db'.format(home)))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'http_default').first()
    if not conn:
        session.add(
            models.Connection(conn_id='http_default', conn_type='http',
                              host='http://www.google.com'))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'mssql_default').first()
    if not conn:
        session.add(
            models.Connection(conn_id='mssql_default', conn_type='mssql',
                              host='localhost', port=1433))
        session.commit()

    conn = session.query(C).filter(C.conn_id == 'vertica_default').first()
    if not conn:
        session.add(
            models.Connection(conn_id='vertica_default', conn_type='vertica',
                              host='localhost', port=5433))
        session.commit()

    # Known event types
    KET = models.KnownEventType
    if not session.query(KET).filter(
            KET.know_event_type == 'Holiday').first():
        session.add(KET(know_event_type='Holiday'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Outage').first():
        session.add(KET(know_event_type='Outage'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Natural Disaster').first():
        session.add(KET(know_event_type='Natural Disaster'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Marketing Campaign').first():
        session.add(KET(know_event_type='Marketing Campaign'))
    session.commit()
    session.close()

    models.DagBag(sync_to_db=True)
def initdb():
    session = settings.Session()

    from airflow import models
    upgradedb()

    merge_conn(
        models.Connection(
            conn_id='airflow_db', conn_type='mysql',
            host='localhost', login='******', password='',
            schema='airflow'))
    merge_conn(
        models.Connection(
            conn_id='beeline_default', conn_type='beeline',
            host='localhost', schema='airflow'))
    merge_conn(
        models.Connection(
            conn_id='local_mysql', conn_type='mysql',
            host='localhost', login='******', password='******',
            schema='airflow'))
    merge_conn(
        models.Connection(
            conn_id='presto_default', conn_type='presto',
            host='localhost', schema='hive', port=3400))
    merge_conn(
        models.Connection(
            conn_id='hive_cli_default', conn_type='hive_cli',
            schema='default'))
    merge_conn(
        models.Connection(
            conn_id='hiveserver2_default', conn_type='hiveserver2',
            host='localhost', schema='default', port=10000))
    merge_conn(
        models.Connection(
            conn_id='metastore_default', conn_type='hive_metastore',
            host='localhost', port=10001))
    merge_conn(
        models.Connection(
            conn_id='mysql_default', conn_type='mysql',
            login='******', host='localhost'))
    merge_conn(
        models.Connection(
            conn_id='postgres_default', conn_type='postgres',
            login='******', schema='airflow', host='localhost'))
    merge_conn(
        models.Connection(
            conn_id='sqlite_default', conn_type='sqlite',
            host='/tmp/sqlite_default.db'))
    merge_conn(
        models.Connection(
            conn_id='http_default', conn_type='http',
            host='https://www.google.com/'))
    merge_conn(
        models.Connection(
            conn_id='mssql_default', conn_type='mssql',
            host='localhost', port=1433))
    merge_conn(
        models.Connection(
            conn_id='vertica_default', conn_type='vertica',
            host='localhost', port=5433))

    # Known event types
    KET = models.KnownEventType
    if not session.query(KET).filter(
            KET.know_event_type == 'Holiday').first():
        session.add(KET(know_event_type='Holiday'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Outage').first():
        session.add(KET(know_event_type='Outage'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Natural Disaster').first():
        session.add(KET(know_event_type='Natural Disaster'))
    if not session.query(KET).filter(
            KET.know_event_type == 'Marketing Campaign').first():
        session.add(KET(know_event_type='Marketing Campaign'))
    session.commit()

    models.DagBag(sync_to_db=True)

    Chart = models.Chart
    chart_label = "Airflow task instance by type"
    chart = session.query(Chart).filter(Chart.label == chart_label).first()
    if not chart:
        chart = Chart(
            label=chart_label,
            conn_id='airflow_db',
            chart_type='bar',
            x_is_date=False,
            sql=(
                "SELECT state, COUNT(1) as number "
                "FROM task_instance "
                "WHERE dag_id LIKE 'example%' "
                "GROUP BY state"),
        )
        session.add(chart)
        session.commit()
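merge_conn is called throughout this newer initdb but is not defined in this excerpt. Given the query-then-add pattern the earlier initdb spells out by hand, it plausibly looks like the sketch below (an assumption; the real helper may also accept an existing session):

    def merge_conn(conn):
        # Hypothetical insert-if-absent helper: add the connection only
        # when no row with the same conn_id already exists.
        from airflow import models
        C = models.Connection
        session = settings.Session()
        if not session.query(C).filter(C.conn_id == conn.conn_id).first():
            session.add(conn)
            session.commit()
        session.close()

Factoring the repetition into one helper is what lets the newer initdb register a connection per line instead of five lines of query/add/commit boilerplate each.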