def __init__(
        self,
        dag_folder=None,
        executor=None,
        include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
        safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE'),
        store_serialized_dags=False,
):
    # do not use default arg in signature, to fix import cycle on plugin load
    if executor is None:
        executor = get_default_executor()
    dag_folder = dag_folder or settings.DAGS_FOLDER
    self.dag_folder = dag_folder
    self.dags = {}
    # the file's last modified timestamp when we last read it
    self.file_last_changed = {}
    self.executor = executor
    self.import_errors = {}
    self.has_logged = False
    self.store_serialized_dags = store_serialized_dags

    self.collect_dags(
        dag_folder=dag_folder,
        include_examples=include_examples,
        safe_mode=safe_mode)
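The comment above is about import-time evaluation: a default in the signature would call get_default_executor() while the module is still being imported, which can re-enter airflow.executors during plugin load. A minimal hedged sketch of the same lazy-default pattern (the function name is illustrative, not part of Airflow):

# Sketch of the lazy-default pattern; make_dagbag is an illustrative name.
from airflow.models import DagBag


def make_dagbag(executor=None):
    # Resolving the default inside the body defers the call (and any imports
    # it triggers) until the function actually runs, avoiding the cycle that
    # `executor=get_default_executor()` in the signature would create.
    if executor is None:
        from airflow.executors import get_default_executor
        executor = get_default_executor()
    return DagBag(executor=executor)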
def create_subdag_operator(dag_parent, label, team):
    subdag, dependencies = create_subdag(dag_parent, label, team)
    # Since v1.10, Airflow uses the SequentialExecutor by default for the
    # SubDagOperator, so we need to explicitly pass the executor configured
    # in airflow.cfg.
    sd_op = SubDagOperator(
        task_id=label,
        dag=dag_parent,
        subdag=subdag,
        executor=get_default_executor(),
    )
    return sd_op, dependencies
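A hedged usage sketch: create_subdag (called inside the factory) is project-specific and not shown above, so the wiring below only illustrates how the factory is meant to be invoked from a parent DAG; all names are placeholders.

# Hypothetical usage; create_subdag and these DAG/label/team names are
# assumptions, not part of the snippet above.
from datetime import datetime
from airflow import DAG

dag_parent = DAG(
    'parent_dag',
    start_date=datetime(2020, 1, 1),
    schedule_interval='@daily',
)
sd_op, dependencies = create_subdag_operator(dag_parent, 'team_a', 'data-eng')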
def test_executors(self):
    from airflow.executors.test_plugin import PluginExecutor
    self.assertTrue(issubclass(PluginExecutor, BaseExecutor))

    from airflow.executors import get_default_executor
    self.assertTrue(issubclass(type(get_default_executor()), BaseExecutor))

    # Test plugin executor import based on a name string (as defined in
    # airflow.cfg); this is not identical to the first assertion!
    from airflow.executors import _get_executor
    self.assertTrue(
        issubclass(type(_get_executor('test_plugin.PluginExecutor')), BaseExecutor))
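For context, a hedged sketch of what the test_plugin this test imports could look like, using the standard Airflow 1.10 plugin mechanism; the no-op class body is a placeholder, not the real test fixture.

# Hedged sketch of a plugin-provided executor; the no-op bodies are
# placeholders standing in for real submission/polling logic.
from airflow.executors.base_executor import BaseExecutor
from airflow.plugins_manager import AirflowPlugin


class PluginExecutor(BaseExecutor):
    """No-op executor, exposed as airflow.executors.test_plugin.PluginExecutor."""

    def execute_async(self, key, command, queue=None, executor_config=None):
        pass  # a real executor would submit `command` for execution here

    def sync(self):
        pass  # a real executor would poll task states here


class TestPlugin(AirflowPlugin):
    name = "test_plugin"
    executors = [PluginExecutor]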
def __init__(self, executor=None, heartrate=None, *args, **kwargs):
    self.hostname = get_hostname()
    self.executor = executor or executors.get_default_executor()
    # Report the class of the executor actually in use; reading it off the
    # `executor` argument would yield 'NoneType' when the default kicks in.
    self.executor_class = self.executor.__class__.__name__
    self.start_date = timezone.utcnow()
    self.latest_heartbeat = timezone.utcnow()
    if heartrate is not None:
        self.heartrate = heartrate
    self.unixname = getpass.getuser()
    self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
    super(BaseJob, self).__init__(*args, **kwargs)
def __init__(self,
             executor=executors.get_default_executor(),
             heartrate=conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC'),
             *args, **kwargs):
    # NOTE: these signature defaults are evaluated once, at import time.
    self.hostname = get_hostname()
    self.executor = executor
    self.executor_class = executor.__class__.__name__
    self.start_date = timezone.utcnow()
    self.latest_heartbeat = timezone.utcnow()
    self.heartrate = heartrate
    self.unixname = getpass.getuser()
    self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
    super().__init__(*args, **kwargs)
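The difference from the previous snippet is worth spelling out: defaults in the signature are evaluated once, when the def statement runs at import time, not per call. A self-contained sketch (plain Python, no Airflow required):

# Sketch: signature defaults are computed once, when `def` executes.
import time


def eager(stamp=time.time()):
    # Default computed at import time, then frozen for every call.
    return stamp


def lazy(stamp=None):
    # Default resolved per call: the pattern the earlier snippets use.
    return time.time() if stamp is None else stamp


first = eager()
time.sleep(0.1)
assert eager() == first   # frozen: same value on every call
assert lazy() > first     # recomputed: strictly later timestamp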
def _run(args, dag, ti):
    if args.local:
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            pickle_id=args.pickle,
            ignore_all_deps=args.ignore_all_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            ignore_ti_state=args.force,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti._run_raw_task(  # pylint: disable=protected-access
            mark_success=args.mark_success,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                with db.create_session() as session:
                    pickle = DagPickle(dag)
                    session.add(pickle)
                    pickle_id = pickle.id
                    # TODO: This should be written to a log
                    print('Pickled dag {dag} as pickle_id: {pickle_id}'.format(
                        dag=dag, pickle_id=pickle_id))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = get_default_executor()
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_all_deps=args.ignore_all_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            ignore_ti_state=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()
def histogram_aggregates_subdag(
    parent_dag_name, child_dag_name, default_args, schedule_interval, dataset_id
):
    GLAM_HISTOGRAM_AGGREGATES_SUBDAG = "%s.%s" % (parent_dag_name, child_dag_name)
    default_args["depends_on_past"] = True
    dag = DAG(
        GLAM_HISTOGRAM_AGGREGATES_SUBDAG,
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    clients_histogram_aggregates_new = bigquery_etl_query(
        task_id="clients_histogram_aggregates_new",
        destination_table="clients_histogram_aggregates_new_v1",
        dataset_id=dataset_id,
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
        parameters=("submission_date:DATE:{{ds}}",),
        arguments=("--replace",),
        dag=dag,
    )

    clients_histogram_aggregates_final = SubDagOperator(
        subdag=repeated_subdag(
            GLAM_HISTOGRAM_AGGREGATES_SUBDAG,
            GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG,
            default_args,
            dag.schedule_interval,
            dataset_id,
        ),
        task_id=GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG,
        executor=get_default_executor(),
        dag=dag,
    )

    clients_histogram_aggregates_new >> clients_histogram_aggregates_final
    return dag
def extracts_subdag(
    parent_dag_name, child_dag_name, default_args, schedule_interval, dataset_id
):
    dag_id = "{}.{}".format(parent_dag_name, child_dag_name)
    dag = DAG(
        dag_id=dag_id,
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    for channel in ("nightly", "beta", "release"):
        SubDagOperator(
            subdag=extract_channel_subdag(
                dag_id,
                "extract_{}".format(channel),
                default_args,
                schedule_interval,
                dataset_id,
                channel,
            ),
            task_id="extract_{}".format(channel),
            executor=get_default_executor(),
            dag=dag,
        )

    return dag
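A hedged usage sketch for the factory above: the parent DAG attaches the returned subdag via its own SubDagOperator, with task_id matching the child_dag_name; the parent DAG object, default_args, schedule, and dataset name below are illustrative assumptions.

# Hypothetical wiring of extracts_subdag into a parent DAG; `dag` and
# `default_args` are assumed to exist in the surrounding module.
from airflow.operators.subdag_operator import SubDagOperator

extracts = SubDagOperator(
    subdag=extracts_subdag(
        "glam", "extracts", default_args, "@daily", "glam_etl"
    ),
    task_id="extracts",
    executor=get_default_executor(),
    dag=dag,
)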
def _create_balances_increment_op(dag):
    subdag = DAG(
        dag_id='%s.%s' % (dag.dag_id, 'balances_increment'),
        default_args=dag.default_args,
        schedule_interval=dag.schedule_interval,
        start_date=dag.start_date,
    )
    balances_increment_op = SubDagOperator(
        task_id='balances_increment',
        subdag=subdag,
        executor=get_default_executor(),
        dag=dag,
    )
    join_op = DummyOperator(
        task_id="all_done_none_failed",
        trigger_rule=TriggerRule.NONE_FAILED,
        dag=subdag,
    )

    current_date: pendulum.Date = pendulum.Date.today()
    for i in range(
            1, commons.configs.balance_increment.NUM_DAYS_TO_MONITOR_BEFORE_TODAY + 1):
        balance_date: pendulum.Date = current_date.add(days=-i)
        balance_success_op = _create_check_balances_loaded_op(subdag, balance_date)
        soft_fail_op = _create_soft_fail_op(
            subdag, balance_success_op.task_id, balance_date)
        detect_delta_one_day_op = _create_detect_delta_op(subdag, balance_date)
        apply_delta_one_day_op = _create_apply_delta_op(subdag, balance_date)
        balances_increment_check_within_period_op = \
            _create_balances_increment_check_within_period_op(
                subdag,
                balance_date,
                commons.configs.balance_increment.NUM_DAYS_TO_UPDATE_AFTER_LOADING,
            )
        balances_increment_check_within_period_op >> balance_success_op \
            >> soft_fail_op >> detect_delta_one_day_op \
            >> apply_delta_one_day_op >> join_op

    return balances_increment_op
    overwrite=False,
    probe_type="keyed_histogram",
    get_logs=False,
    dag=dag,
)

clients_histogram_aggregates = SubDagOperator(
    subdag=histogram_aggregates_subdag(
        GLAM_DAG,
        GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG,
        default_args,
        dag.schedule_interval,
        dataset_id,
    ),
    task_id=GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG,
    executor=get_default_executor(),
    dag=dag,
)

histogram_percentiles = bigquery_etl_query(
    task_id="histogram_percentiles",
    destination_table="histogram_percentiles_v1",
    dataset_id=dataset_id,
    project_id=project_id,
    owner="*****@*****.**",
    date_partition_parameter=None,
    arguments=("--replace",),
    dag=dag,
)

glam_user_counts = bigquery_etl_query(