Example #1
    def __init__(
        self,
        dag_folder=None,
        executor=None,
        include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
        safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE'),
        store_serialized_dags=False,
    ):

        # do not use a default arg in the signature, to avoid an import cycle on plugin load
        if executor is None:
            executor = get_default_executor()
        dag_folder = dag_folder or settings.DAGS_FOLDER
        self.dag_folder = dag_folder
        self.dags = {}
        # the file's last modified timestamp when we last read it
        self.file_last_changed = {}
        self.executor = executor
        self.import_errors = {}
        self.has_logged = False
        self.store_serialized_dags = store_serialized_dags

        self.collect_dags(dag_folder=dag_folder,
                          include_examples=include_examples,
                          safe_mode=safe_mode)
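
Example #1 resolves the executor inside the constructor body instead of in the signature, so plugin-provided executors are only imported when a DagBag is actually built. A minimal, hypothetical sketch of that lazy-default pattern (build_bag is not Airflow code):

def build_bag(executor=None):
    # Resolving the default here, not in the signature, defers the import of
    # get_default_executor() (and any plugin executors it loads) to call time.
    if executor is None:
        from airflow.executors import get_default_executor
        executor = get_default_executor()
    return executor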
Example #2
def create_subdag_operator(dag_parent, label, team):
    subdag, dependencies = create_subdag(dag_parent, label, team)

    # Since v1.10, Airflow uses the SequentialExecutor as the default executor
    # for the SubDagOperator, so we need to explicitly pass the executor
    # configured in airflow.cfg.
    sd_op = SubDagOperator(
        task_id=label, dag=dag_parent, subdag=subdag, executor=get_default_executor()
    )
    return sd_op, dependencies
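
A hypothetical call site for create_subdag_operator above; the parent DAG, default_args, and the label/team values are assumptions, and create_subdag is expected to be defined elsewhere in the project:

dag_parent = DAG("parent_dag", default_args=default_args, schedule_interval="@daily")
ingest_op, ingest_deps = create_subdag_operator(dag_parent, "ingest", team="data-eng")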
Example #3
    def test_executors(self):
        from airflow.executors.test_plugin import PluginExecutor
        self.assertTrue(issubclass(PluginExecutor, BaseExecutor))

        from airflow.executors import get_default_executor
        self.assertTrue(issubclass(type(get_default_executor()), BaseExecutor))

        # Test importing a plugin executor by its name string (as it would be
        # set in airflow.cfg); this is not identical to the first assertion!
        from airflow.executors import _get_executor
        self.assertTrue(issubclass(type(_get_executor('test_plugin.PluginExecutor')), BaseExecutor))
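
The test above distinguishes two lookups: get_default_executor() returns the executor selected by airflow.cfg, while the private _get_executor() resolves an explicit name string. A hedged sketch against the Airflow 1.10-era API:

from airflow.executors import get_default_executor, _get_executor

configured = get_default_executor()            # whatever [core] executor in airflow.cfg selects
by_name = _get_executor('SequentialExecutor')  # resolved from an explicit name string
print(type(configured).__name__, type(by_name).__name__)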
Example #4
    def __init__(self, executor=None, heartrate=None, *args, **kwargs):
        self.hostname = get_hostname()
        self.executor = executor or executors.get_default_executor()
        # use the resolved executor so the class name is recorded correctly
        # even when the default executor was substituted for a None argument
        self.executor_class = self.executor.__class__.__name__
        self.start_date = timezone.utcnow()
        self.latest_heartbeat = timezone.utcnow()
        if heartrate is not None:
            self.heartrate = heartrate
        self.unixname = getpass.getuser()
        self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
        super(BaseJob, self).__init__(*args, **kwargs)
Example #5
    def __init__(self,
                 executor=executors.get_default_executor(),
                 heartrate=conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC'),
                 *args,
                 **kwargs):
        self.hostname = get_hostname()
        self.executor = executor
        self.executor_class = executor.__class__.__name__
        self.start_date = timezone.utcnow()
        self.latest_heartbeat = timezone.utcnow()
        self.heartrate = heartrate
        self.unixname = getpass.getuser()
        self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
        super().__init__(*args, **kwargs)
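
Examples #4 and #5 differ in when the default executor is chosen: a default argument in the signature is evaluated once, when the module is imported, while resolving it inside the body (executor or executors.get_default_executor()) defers the lookup to each call. A small illustration of that Python behaviour (not Airflow code):

import time

def eager(stamp=time.time()):   # default evaluated once, at definition time
    return stamp

def lazy(stamp=None):           # default resolved on every call
    return time.time() if stamp is None else stamp

print(eager(), eager())  # identical values
print(lazy(), lazy())    # values differ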
Example #6
def _run(args, dag, ti):
    if args.local:
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            pickle_id=args.pickle,
            ignore_all_deps=args.ignore_all_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            ignore_ti_state=args.force,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti._run_raw_task(  # pylint: disable=protected-access
            mark_success=args.mark_success,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                with db.create_session() as session:
                    pickle = DagPickle(dag)
                    session.add(pickle)
                    pickle_id = pickle.id
                    # TODO: This should be written to a log
                    print('Pickled dag {dag} as pickle_id: {pickle_id}'.format(
                        dag=dag, pickle_id=pickle_id))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = get_default_executor()
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_all_deps=args.ignore_all_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            ignore_ti_state=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()
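
The final branch of _run hands the task instance to the default executor and drives it through a start / queue / heartbeat / end cycle. A hedged sketch of that lifecycle in isolation, assuming ti is an existing TaskInstance:

executor = get_default_executor()
executor.start()
executor.queue_task_instance(ti)   # enqueue the work
executor.heartbeat()               # ask the executor to submit what was queued
executor.end()                     # flush outstanding work and shut down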
Example #7
def histogram_aggregates_subdag(parent_dag_name, child_dag_name, default_args,
                                schedule_interval, dataset_id):
    GLAM_HISTOGRAM_AGGREGATES_SUBDAG = "%s.%s" % (parent_dag_name,
                                                  child_dag_name)
    default_args["depends_on_past"] = True
    dag = DAG(
        GLAM_HISTOGRAM_AGGREGATES_SUBDAG,
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    clients_histogram_aggregates_new = bigquery_etl_query(
        task_id="clients_histogram_aggregates_new",
        destination_table="clients_histogram_aggregates_new_v1",
        dataset_id=dataset_id,
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
        parameters=("submission_date:DATE:{{ds}}", ),
        arguments=("--replace", ),
        dag=dag,
    )

    clients_histogram_aggregates_final = SubDagOperator(
        subdag=repeated_subdag(
            GLAM_HISTOGRAM_AGGREGATES_SUBDAG,
            GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG,
            default_args,
            dag.schedule_interval,
            dataset_id,
        ),
        task_id=GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG,
        executor=get_default_executor(),
        dag=dag,
    )

    clients_histogram_aggregates_new >> clients_histogram_aggregates_final
    return dag
Example #8
def extracts_subdag(parent_dag_name, child_dag_name, default_args,
                    schedule_interval, dataset_id):
    dag_id = "{}.{}".format(parent_dag_name, child_dag_name)
    dag = DAG(dag_id=dag_id,
              default_args=default_args,
              schedule_interval=schedule_interval)

    for channel in ("nightly", "beta", "release"):
        SubDagOperator(
            subdag=extract_channel_subdag(
                dag_id,
                "extract_{}".format(channel),
                default_args,
                schedule_interval,
                dataset_id,
                channel,
            ),
            task_id="extract_{}".format(channel),
            executor=get_default_executor(),
            dag=dag,
        )

    return dag
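
A hypothetical way to attach the DAG returned by extracts_subdag to a parent DAG; GLAM_DAG, default_args, dataset_id, and dag are assumed to exist in the caller's module (compare Example #10):

extracts = SubDagOperator(
    subdag=extracts_subdag(
        GLAM_DAG, "extracts", default_args, dag.schedule_interval, dataset_id
    ),
    task_id="extracts",
    executor=get_default_executor(),
    dag=dag,
)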
Example #9
def _create_balances_increment_op(dag):
    subdag = DAG(
        dag_id='%s.%s' % (dag.dag_id, 'balances_increment'),
        default_args=dag.default_args,
        schedule_interval=dag.schedule_interval,
        start_date=dag.start_date,
    )
    balances_increment_op = SubDagOperator(
        task_id='balances_increment',
        subdag=subdag,
        executor=get_default_executor(),
        dag=dag,
    )
    join_op = DummyOperator(task_id="all_done_none_failed",
                            trigger_rule=TriggerRule.NONE_FAILED,
                            dag=subdag)
    current_date: pendulum.Date = pendulum.Date.today()
    for i in range(
            1,
            commons.configs.balance_increment.NUM_DAYS_TO_MONITOR_BEFORE_TODAY
            + 1):
        balance_date: pendulum.Date = current_date.add(days=-i)
        balance_success_op = _create_check_balances_loaded_op(
            subdag, balance_date)
        soft_fail_op = _create_soft_fail_op(subdag, balance_success_op.task_id,
                                            balance_date)
        detect_delta_one_day_op = _create_detect_delta_op(subdag, balance_date)
        apply_delta_one_day_op = _create_apply_delta_op(subdag, balance_date)
        balances_increment_check_within_period_op = \
            _create_balances_increment_check_within_period_op(
                subdag,
                balance_date,
                commons.configs.balance_increment.NUM_DAYS_TO_UPDATE_AFTER_LOADING
            )
        balances_increment_check_within_period_op >> balance_success_op \
        >> soft_fail_op >> detect_delta_one_day_op >> apply_delta_one_day_op >> join_op
    return balances_increment_op
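
The join in Example #9 relies on TriggerRule.NONE_FAILED so that branches skipped by the soft-fail step still let the join run. A minimal sketch that isolates this trigger rule (Airflow 1.10-era imports; subdag is assumed to exist):

from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.trigger_rule import TriggerRule

# Runs as long as no upstream task failed; skipped upstream tasks do not block it.
join_op = DummyOperator(task_id="all_done_none_failed",
                        trigger_rule=TriggerRule.NONE_FAILED,
                        dag=subdag)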
Example #10
    overwrite=False,
    probe_type="keyed_histogram",
    get_logs=False,
    dag=dag,
)

clients_histogram_aggregates = SubDagOperator(
    subdag=histogram_aggregates_subdag(
        GLAM_DAG,
        GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG,
        default_args,
        dag.schedule_interval,
        dataset_id,
    ),
    task_id=GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG,
    executor=get_default_executor(),
    dag=dag,
)

histogram_percentiles = bigquery_etl_query(
    task_id="histogram_percentiles",
    destination_table="histogram_percentiles_v1",
    dataset_id=dataset_id,
    project_id=project_id,
    owner="*****@*****.**",
    date_partition_parameter=None,
    arguments=("--replace", ),
    dag=dag,
)

glam_user_counts = bigquery_etl_query(