Example 1
    def test_templated_sensor(self):
        dag = DAG(TEST_DAG_ID, default_args=self.args)

        with dag:
            sensor = ExternalTaskSensor(
                task_id='templated_task',
                external_dag_id='dag_{{ ds }}',
                external_task_id='task_{{ ds }}',
                start_date=DEFAULT_DATE
            )

        instance = TaskInstance(sensor, DEFAULT_DATE)
        instance.render_templates()

        self.assertEqual(sensor.external_dag_id,
                         "dag_{}".format(DEFAULT_DATE.date()))
        self.assertEqual(sensor.external_task_id,
                         "task_{}".format(DEFAULT_DATE.date()))
Example 2

def buildSubDag(dag_id, filePath, dagIdToWait):
    # Note: pushTimeTaskId and placeToStoreResultFile are assumed to be defined
    # elsewhere in the module this snippet was taken from.

    with DAG(
        dag_id=dag_id,
        schedule_interval=None,
        start_date=datetime(2018, 11, 11)
    ) as dag:

        def takeTime(**context):
            time = context['ti'].xcom_pull(task_ids=pushTimeTaskId)
            print(f'Time pulled from XCom: {time}')
            return time
        
        externalTaskSensor = ExternalTaskSensor(
            task_id="wait_for_other_dag",
            external_dag_id=dagIdToWait,
            execution_delta=timedelta(minutes=0),
            external_task_id=None,
        )

        printOperator = PythonOperator(
            task_id="print_time",
            python_callable=takeTime,
            provide_context=True
        )
        
        removeFileOp = BashOperator(
            task_id="remove_file",
            bash_command="rm {}".format(filePath)
        )
        
        finishedDateOp = BashOperator(
            task_id="_create_finish_date_file",
            bash_command="touch {}/finished_{}".format(placeToStoreResultFile, "{{ ds }}")
        )
        
        externalTaskSensor >> printOperator >> removeFileOp >> finishedDateOp
        return dag
Example 3
    def test_external_task_sensor_fn_multiple_execution_dates(self):
        bash_command_code = """
{% set s=execution_date.time().second %}
echo "second is {{ s }}"
if [[ $(( {{ s }} % 60 )) == 1 ]]
    then
        exit 1
fi
exit 0
"""
        dag_external_id = TEST_DAG_ID + '_external'
        dag_external = DAG(dag_external_id,
                           default_args=self.args,
                           schedule_interval=timedelta(seconds=1))
        task_external_with_failure = BashOperator(
            task_id="task_external_with_failure",
            bash_command=bash_command_code,
            retries=0,
            dag=dag_external)
        task_external_without_failure = DummyOperator(
            task_id="task_external_without_failure",
            retries=0,
            dag=dag_external)

        task_external_without_failure.run(start_date=DEFAULT_DATE,
                                          end_date=DEFAULT_DATE +
                                          timedelta(seconds=1),
                                          ignore_ti_state=True)

        session = settings.Session()
        TI = TaskInstance
        try:
            task_external_with_failure.run(start_date=DEFAULT_DATE,
                                           end_date=DEFAULT_DATE +
                                           timedelta(seconds=1),
                                           ignore_ti_state=True)
            # The test_with_failure task is expected to fail
            # once per minute (the run on the first second of
            # each minute).
        except Exception as e:
            failed_tis = session.query(TI).filter(
                TI.dag_id == dag_external_id, TI.state == State.FAILED,
                TI.execution_date == DEFAULT_DATE +
                timedelta(seconds=1)).all()
            if (len(failed_tis) == 1
                    and failed_tis[0].task_id == 'task_external_with_failure'):
                pass
            else:
                raise e

        dag_id = TEST_DAG_ID
        dag = DAG(dag_id,
                  default_args=self.args,
                  schedule_interval=timedelta(minutes=1))
        task_without_failure = ExternalTaskSensor(
            task_id='task_without_failure',
            external_dag_id=dag_external_id,
            external_task_id='task_external_without_failure',
            execution_date_fn=lambda dt:
            [dt + timedelta(seconds=i) for i in range(2)],
            allowed_states=['success'],
            retries=0,
            timeout=1,
            poke_interval=1,
            dag=dag)
        task_with_failure = ExternalTaskSensor(
            task_id='task_with_failure',
            external_dag_id=dag_external_id,
            external_task_id='task_external_with_failure',
            execution_date_fn=lambda dt:
            [dt + timedelta(seconds=i) for i in range(2)],
            allowed_states=['success'],
            retries=0,
            timeout=1,
            poke_interval=1,
            dag=dag)

        task_without_failure.run(start_date=DEFAULT_DATE,
                                 end_date=DEFAULT_DATE,
                                 ignore_ti_state=True)

        with self.assertRaises(AirflowSensorTimeout):
            task_with_failure.run(start_date=DEFAULT_DATE,
                                  end_date=DEFAULT_DATE,
                                  ignore_ti_state=True)
with DAG(
        "bhr_collection",
        default_args=default_args,
        schedule_interval="0 5 * * *",
) as dag:
    # Jobs read from/write to s3://telemetry-public-analysis-2/bhr/data/hang_aggregates/
    write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw'
    aws_access_key, aws_secret_key, _ = AwsHook(
        write_aws_conn_id).get_credentials()

    wait_for_bhr_ping = ExternalTaskSensor(
        task_id="wait_for_bhr_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(hours=4),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        dag=dag,
    )

    params = get_dataproc_parameters("google_cloud_airflow_dataproc")

    bhr_collection = SubDagOperator(
        task_id="bhr_collection",
        dag=dag,
        subdag=moz_dataproc_pyspark_runner(
            parent_dag_name=dag.dag_id,
            image_version="1.5",
            dag_name="bhr_collection",
            default_args=default_args,
        task_id="internet_outages__global_outages__v1",
        destination_table="global_outages_v1",
        dataset_id="internet_outages",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    wait_for_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(seconds=7200),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    internet_outages__global_outages__v1.set_upstream(
        wait_for_copy_deduplicate_all)
    wait_for_copy_deduplicate_main_ping = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_main_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_main_ping",
        execution_delta=datetime.timedelta(seconds=7200),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
Example 6
        task_id="telemetry_derived__addon_names__v1",
        destination_table="addon_names_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter=None,
        depends_on_past=False,
        parameters=["submission_date:DATE:{{ds}}"],
        dag=dag,
    )

    wait_for_search_derived__search_clients_daily__v8 = ExternalTaskSensor(
        task_id="wait_for_search_derived__search_clients_daily__v8",
        external_dag_id="bqetl_search",
        external_task_id="search_derived__search_clients_daily__v8",
        check_existence=True,
        mode="reschedule",
    )

    telemetry_derived__addons_daily__v1.set_upstream(
        wait_for_search_derived__search_clients_daily__v8)
    wait_for_telemetry_derived__clients_last_seen__v1 = ExternalTaskSensor(
        task_id="wait_for_telemetry_derived__clients_last_seen__v1",
        external_dag_id="bqetl_clients_daily",
        external_task_id="telemetry_derived__clients_last_seen__v1",
        check_existence=True,
        mode="reschedule",
    )

    telemetry_derived__addons_daily__v1.set_upstream(
Example 7
    'start_date': datetime(2019, 9, 30),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('fx_usage_report',
          default_args=default_args,
          schedule_interval='@weekly')

wait_for_main_summary = ExternalTaskSensor(
    task_id='wait_for_main_summary',
    external_dag_id='parquet_export',
    external_task_id='main_summary_export',
    execution_delta=timedelta(
        days=-7, hours=-2
    ),  # main_summary waits two hours, execution date is beginning of the week
    dag=dag)
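# Sketch only (not part of the original DAG): ExternalTaskSensor looks for the external
# run at execution_date - execution_delta, so the negative delta above targets the
# parquet_export run 7 days and 2 hours *after* this weekly DAG's execution date.
_example_execution_date = datetime(2019, 9, 30)  # a Monday 00:00, the start of a week
assert _example_execution_date - timedelta(days=-7, hours=-2) == datetime(2019, 10, 7, 2)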

cluster_name = 'fx-usage-report-dataproc-cluster'
gcp_conn_id = 'google_cloud_airflow_dataproc'

# AWS credentials to read/write from output bucket
aws_conn_id = 'aws_prod_fx_usage_report'
aws_access_key, aws_secret_key, session = AwsHook(
    aws_conn_id).get_credentials()

output_bucket = 'net-mozaws-prod-us-west-2-data-public'

usage_report = SubDagOperator(
Example 8
        task_id="telemetry_derived__smoot_usage_nondesktop_compressed__v2",
        destination_table="smoot_usage_nondesktop_compressed_v2",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    wait_for_telemetry_derived__clients_last_seen__v1 = ExternalTaskSensor(
        task_id="wait_for_telemetry_derived__clients_last_seen__v1",
        external_dag_id="bqetl_main_summary",
        external_task_id="telemetry_derived__clients_last_seen__v1",
        execution_delta=datetime.timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    telemetry_derived__smoot_usage_desktop__v2.set_upstream(
        wait_for_telemetry_derived__clients_last_seen__v1)

    telemetry_derived__smoot_usage_desktop_compressed__v2.set_upstream(
        telemetry_derived__smoot_usage_desktop__v2)

    wait_for_firefox_accounts_derived__fxa_users_last_seen__v1 = ExternalTaskSensor(
        task_id="wait_for_firefox_accounts_derived__fxa_users_last_seen__v1",
        external_dag_id="bqetl_fxa_events",
        external_task_id="firefox_accounts_derived__fxa_users_last_seen__v1",
    # If a task fails, retry it once after waiting at least 5 minutes
    "retries": 1,
    "retry_delay": datetime.timedelta(minutes=5),
}

dag_name = "fission_experiment_monitoring"

with models.DAG(
        dag_name,
        schedule_interval="0 2 * * *",
        default_args=default_args) as dag:

    wait_for_copy_deduplicate_main_ping = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_main_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_main_ping",
        execution_delta=datetime.timedelta(hours=1),
        dag=dag,
    )

    fission_monitoring_main_v1 = bigquery_etl_query(
        task_id="fission_monitoring_main_v1",
        project_id="moz-fx-data-shared-prod",
        destination_table="fission_monitoring_main_v1",
        dataset_id="telemetry_derived",
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
    )

    wait_for_copy_deduplicate_crash_ping = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_crash_ping",
        external_dag_id="copy_deduplicate",
Example 10
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)

if params.is_dev:
    copy_to_dev = copy_artifacts_dev(dag, params.project_id,
                                     params.artifact_bucket,
                                     params.storage_bucket)
    copy_to_dev >> ltv_daily
else:
    wait_for_search_clients_last_seen = ExternalTaskSensor(
        task_id="wait_for_search_clients_last_seen",
        external_dag_id="bqetl_search",
        external_task_id="search_derived__search_clients_last_seen__v1",
        execution_delta=timedelta(hours=1),
        check_existence=True,
        dag=dag,
    )
    wait_for_search_clients_last_seen >> ltv_daily

response = urlopen('/'.join([
    'https://raw.githubusercontent.com/mozilla/bigquery-etl/master/sql',
    'revenue_derived', 'client_ltv_v1', 'query.sql'
]))

BigQueryOperator.template_fields += ('query_params', )
ltv_revenue_join = BigQueryOperator(
    task_id='ltv_revenue_join',
    sql=response.read().decode('utf-8'),
    query_params=[{
    core_clients_last_seen = bigquery_etl_query(
        task_id='core_clients_last_seen',
        destination_table='core_clients_last_seen_raw_v1',
        dataset_id='telemetry',
        depends_on_past=True,
    )

    core_clients_daily >> core_clients_last_seen

    # Daily and last seen views on top of glean pings.

    wait_for_copy_deduplicate = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=timedelta(hours=1),
        dag=dag,
    )

    fenix_clients_daily = bigquery_etl_query(
        task_id='fenix_clients_daily',
        destination_table='moz-fx-data-shared-prod:org_mozilla_fenix_derived.clients_daily_v1',
        sql_file_path='sql/org_mozilla_fenix_derived/clients_daily_v1/query.sql',
        dataset_id='org_mozilla_fenix_derived',
        start_date=datetime(2019, 9, 5),
    )

    fenix_clients_daily << wait_for_copy_deduplicate
Example 12
    # top_signatures_correlations uploads results to public analysis bucket
    write_aws_conn_id = "aws_dev_telemetry_public_analysis_2_rw"
    analysis_access_key, analysis_secret_key, _ = AwsHook(
        write_aws_conn_id).get_credentials()

    # modules_with_missing_symbols sends results as email
    ses_aws_conn_id = "aws_data_iam_ses"
    ses_access_key, ses_secret_key, _ = AwsHook(
        ses_aws_conn_id).get_credentials()

    wait_for_socorro_import = ExternalTaskSensor(
        task_id="wait_for_socorro_import",
        external_dag_id="socorro_import",
        external_task_id="bigquery_load",
        check_existence=True,
        execution_delta=datetime.timedelta(hours=5),
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        email_on_retry=False,
        dag=dag,
    )

    params = get_dataproc_parameters("google_cloud_airflow_dataproc")

    modules_with_missing_symbols = SubDagOperator(
        task_id="modules_with_missing_symbols",
        dag=dag,
        subdag=moz_dataproc_pyspark_runner(
            parent_dag_name=dag.dag_id,
            image_version="1.5",
            dag_name="modules_with_missing_symbols",
    name="taar-update-whitelist",
    # This uses a circleci built docker image from github.com/mozilla/taar_gcp_etl
    image="gcr.io/moz-fx-data-airflow-prod-88e0/taar_gcp_etl:0.1",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    arguments=["-m", "taar_etl.taar_update_whitelist", "--date", "{{ ds_nodash }}"],
    env_vars={
        "AWS_ACCESS_KEY_ID": taar_aws_access_key,
        "AWS_SECRET_ACCESS_KEY": taar_aws_secret_key,
    },
    dag=dag,
)

wait_for_clients_daily_export = ExternalTaskSensor(
    task_id="wait_for_clients_daily_export",
    external_dag_id="main_summary",
    external_task_id="clients_daily_export",
    dag=dag)

wait_for_main_summary_export = ExternalTaskSensor(
    task_id="wait_for_main_summary_export",
    external_dag_id="main_summary",
    external_task_id="main_summary_export",
    dag=dag)


taar_locale = SubDagOperator(
    task_id="taar_locale",
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="taar_locale",
Example 14
    True,
    "retries":
    2,
}

with DAG("bqetl_deviations",
         default_args=default_args,
         schedule_interval="0 1 * * *") as dag:

    telemetry_derived__deviations__v1 = bigquery_etl_query(
        task_id="telemetry_derived__deviations__v1",
        destination_table="deviations_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    wait_for_anomdtct_anomdtct = ExternalTaskSensor(
        task_id="wait_for_anomdtct_anomdtct",
        external_dag_id="anomdtct",
        external_task_id="anomdtct",
        check_existence=True,
        dag=dag,
    )

    telemetry_derived__deviations__v1.set_upstream(wait_for_anomdtct_anomdtct)
Example 16
    "retries": 2,
}

with DAG(
    "bqetl_asn_aggregates", default_args=default_args, schedule_interval="0 2 * * *"
) as dag:

    telemetry_derived__asn_aggregates__v1 = bigquery_etl_query(
        task_id="telemetry_derived__asn_aggregates__v1",
        destination_table="asn_aggregates_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        parameters=["n_clients:INT64:500"],
        dag=dag,
    )

    wait_for_bq_main_events = ExternalTaskSensor(
        task_id="wait_for_bq_main_events",
        external_dag_id="copy_deduplicate",
        external_task_id="bq_main_events",
        execution_delta=datetime.timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
    )

    telemetry_derived__asn_aggregates__v1.set_upstream(wait_for_bq_main_events)
    "retries": 2,
    "retry_delay": timedelta(minutes=10),
}

dag = DAG("public_data_hardware_report", default_args=default_args, schedule_interval="0 1 * * MON")

# Required to write json output to s3://telemetry-public-analysis-2/public-data-report/hardware/
write_aws_conn_id = 'aws_dev_telemetry_public_analysis_2_rw'
aws_access_key, aws_secret_key, session = AwsHook(write_aws_conn_id).get_credentials()

# hardware_report's execution date will be {now}-7days. It will read last week's main pings,
# therefore we need to wait for yesterday's Main Ping deduplication task to finish
wait_for_main_ping = ExternalTaskSensor(
    task_id="wait_for_main_ping",
    external_dag_id="main_summary",
    external_task_id="copy_deduplicate_main_ping",
    execution_delta=timedelta(days=-6),
    check_existence=True,
    dag=dag,
)

params = get_dataproc_parameters("google_cloud_airflow_dataproc")

hardware_report = SubDagOperator(
    task_id="public_data_hardware_report",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="public_data_hardware_report",
        default_args=default_args,
        cluster_name="public-data-hardware-report-{{ ds }}",
        job_name="Firefox_Public_Data_Hardware_Report-{{ ds }}",
Example 18
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'email_discounts',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Email discounts to customers that have experienced order delays daily.')

# Wait for delivery_times_7_days DAG to complete
t1 = ExternalTaskSensor(task_id='wait_for_delivery_times_7_days',
                        external_dag_id='delivery_times_7_days',
                        mode='reschedule',
                        dag=dag)

t2 = PostgresOperator(task_id='insert',
                      postgres_conn_id='food_delivery_db',
                      sql='''
    SELECT * FROM discounts;
    ''',
                      dag=dag)

t1 >> t2
Example 19
}

GLAM_DAG = "glam"
GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG = "clients_histogram_aggregates"
PERCENT_RELEASE_WINDOWS_SAMPLING = "10"

dag = DAG(GLAM_DAG, default_args=default_args, schedule_interval="0 2 * * *")

gcp_conn = GoogleCloudBaseHook("google_cloud_airflow_dataproc")

# Make sure all the data for the given day has arrived before running.
wait_for_main_ping = ExternalTaskSensor(
    task_id="wait_for_main_ping",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_main_ping",
    execution_delta=timedelta(hours=1),
    check_existence=True,
    dag=dag,
)

latest_versions = bigquery_etl_query(
    task_id="latest_versions",
    destination_table="latest_versions",
    dataset_id=dataset_id,
    project_id=project_id,
    owner="*****@*****.**",
    date_partition_parameter=None,
    arguments=("--replace", ),
    dag=dag,
)
Example 20
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    search_derived__search_metric_contribution__v1.set_upstream(
        search_derived__search_clients_daily__v8
    )

    search_derived__search_aggregates__v8.set_upstream(
        search_derived__search_clients_daily__v8
    )

    search_derived__search_clients_last_seen__v1.set_upstream(
        search_derived__search_clients_daily__v8
    )

    wait_for_main_summary_main_summary = ExternalTaskSensor(
        task_id="wait_for_main_summary_main_summary",
        external_dag_id="main_summary",
        external_task_id="main_summary",
        check_existence=True,
        mode="reschedule",
        dag=dag,
    )

    search_derived__search_clients_daily__v8.set_upstream(
        wait_for_main_summary_main_summary
    )
        task_id="telemetry_derived__adm_engagements_daily__v1",
        destination_table="adm_engagements_daily_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    wait_for_bq_main_events = ExternalTaskSensor(
        task_id="wait_for_bq_main_events",
        external_dag_id="copy_deduplicate",
        external_task_id="bq_main_events",
        execution_delta=datetime.timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    telemetry_derived__adm_engagements_daily__v1.set_upstream(wait_for_bq_main_events)
    wait_for_event_events = ExternalTaskSensor(
        task_id="wait_for_event_events",
        external_dag_id="copy_deduplicate",
        external_task_id="event_events",
        execution_delta=datetime.timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )
Example 22
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    messaging_system_derived__cfr_exact_mau28_by_dimensions__v1.set_upstream(
        messaging_system_derived__cfr_users_last_seen__v1)

    wait_for_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    messaging_system_derived__cfr_users_daily__v1.set_upstream(
        wait_for_copy_deduplicate_all)

    messaging_system_derived__cfr_users_last_seen__v1.set_upstream(
        messaging_system_derived__cfr_users_daily__v1)

    messaging_system_derived__onboarding_users_daily__v1.set_upstream(
        wait_for_copy_deduplicate_all)

    messaging_system_derived__onboarding_users_last_seen__v1.set_upstream(
            default_args=rocket_args,
            project='moz-fx-data-shared-prod',
            dataset='telemetry',
            table_or_view='rocket_android_events_v1',
            s3_prefix='rocket_android',
        ),
        task_id=rocket_android_task_id
    )

    # DevTools view merges events from `telemetry.main` and `telemetry.event`.
    # We need to make sure both tables are ready and deduplicated before proceeding.
    wait_for_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(hours=1),
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        email_on_retry=False,
        dag=dag)
    wait_for_copy_deduplicate_main_ping = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_main_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_main_ping",
        execution_delta=datetime.timedelta(hours=1),
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
        email_on_retry=False,
        dag=dag,
    )
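    # Sketch only (not part of the original DAG): a hypothetical placeholder standing in
    # for the DevTools events task the comment above refers to; the point is that it must
    # sit downstream of *both* deduplication sensors before it runs.
    from airflow.operators.dummy_operator import DummyOperator  # Airflow 1.10-style import
    devtools_events_placeholder = DummyOperator(task_id="devtools_events_placeholder", dag=dag)
    devtools_events_placeholder.set_upstream(wait_for_copy_deduplicate_all)
    devtools_events_placeholder.set_upstream(wait_for_copy_deduplicate_main_ping)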
Example 24
    ],
    "org_mozilla_fenix_glam_beta":
    ["org_mozilla_fenix", "org_mozilla_firefox_beta"],
    "org_mozilla_fenix_glam_release": ["org_mozilla_firefox"],
}

dag = DAG("glam_fenix",
          default_args=default_args,
          schedule_interval="0 2 * * *")

wait_for_copy_deduplicate = ExternalTaskSensor(
    task_id="wait_for_copy_deduplicate",
    external_dag_id="copy_deduplicate",
    external_task_id="copy_deduplicate_all",
    execution_delta=timedelta(hours=1),
    check_existence=True,
    mode="reschedule",
    pool="DATA_ENG_EXTERNALTASKSENSOR",
    email_on_retry=False,
    dag=dag,
)

mapping = {}
for product in PRODUCTS:
    query = generate_and_run_glean_query(
        task_id=f"daily_{product}",
        product=product,
        destination_project_id=PROJECT,
        env_vars=dict(STAGE="daily"),
        dag=dag,
    )
Example 25
            "--model-input-table-id",
            "ltv_daily_model_perf",
            "--model-output-table-id",
            "ltv_daily",
            "--temporary-gcs-bucket",
            params.storage_bucket,
        ],
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)

if params.is_dev:
    copy_to_dev = copy_artifacts_dev(dag, params.project_id,
                                     params.artifact_bucket,
                                     params.storage_bucket)
    copy_to_dev >> ltv_daily
else:
    wait_for_search_clients_last_seen = ExternalTaskSensor(
        task_id="wait_for_search_clients_last_seen",
        external_dag_id="main_summary",
        external_task_id="search_clients_last_seen",
        execution_delta=timedelta(hours=-1),
        check_existence=True,
        dag=dag,
    )
    wait_for_search_clients_last_seen >> ltv_daily
Example 26
        task_id="event_events",
        project_id="moz-fx-data-shared-prod",
        destination_table="event_events_v1",
        dataset_id="telemetry_derived",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION', ),
    )

    copy_deduplicate_all >> event_events

    # Experiment enrollment aggregates chain (depends on events)

    wait_for_main_events = ExternalTaskSensor(
        task_id="wait_for_main_events",
        external_dag_id="main_summary",
        external_task_id="bq_main_events",
        dag=dag)

    experiment_enrollment_aggregates = bigquery_etl_query(
        task_id="experiment_enrollment_aggregates",
        project_id="moz-fx-data-shared-prod",
        destination_table="experiment_enrollment_aggregates_v1",
        dataset_id="telemetry_derived",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"])

    gen_query_task_id = "experiment_enrollment_aggregates_live_generate_query"

    # setting xcom_push to True outputs this query to an xcom
    experiment_enrollment_aggregates_live_generate_query = gke_command(
Example 27
    'retries': 3
}

dag_core_id = TEST_DAG_ID + '_core'
dag_core = DAG(dag_core_id,
               default_args=args,
               schedule_interval=timedelta(seconds=1))
task_core = DummyOperator(task_id='task_core', dag=dag_core)

dag_first_child_id = TEST_DAG_ID + '_first_child'
dag_first_child = DAG(dag_first_child_id,
                      default_args=args,
                      schedule_interval=timedelta(seconds=1))
t1_first_child = ExternalTaskSensor(task_id='t1_first_child',
                                    external_dag_id=dag_core_id,
                                    external_task_id='task_core',
                                    poke_interval=1,
                                    dag=dag_first_child,
                                    depends_on_past=True)
t2_first_child = DummyOperator(task_id='t2_first_child',
                               dag=dag_first_child,
                               depends_on_past=True)
t2_first_child.set_upstream(t1_first_child)

dag_second_child_id = TEST_DAG_ID + '_second_child'
dag_second_child = DAG(dag_second_child_id,
                       default_args=args,
                       schedule_interval=timedelta(seconds=1))
t1_second_child = ExternalTaskSensor(task_id='t1_second_child',
                                     external_dag_id=dag_first_child_id,
                                     external_task_id='t2_first_child',
                                     poke_interval=1,
Example 28
CleanUpSKOPages = TriggerDagRunOperator(task_id='CleanUpSKOPages',
                                        trigger_dag_id="01_CleanupDag",
                                        python_callable=trigger_dag_run_pass_params,
                                        params={},
                                        dag=dag)

ImportComtelData = TriggerDagRunOperator(task_id='ImportComtelData',
                                         trigger_dag_id="02_ImportDataDag",
                                         python_callable=trigger_dag_run_pass_params,
                                         params={},
                                         dag=dag)

WaitForImportComtelData = ExternalTaskSensor(task_id='WaitForImportComtelData',
                                             external_dag_id='02_ImportDataDag',
                                             external_task_id='dag_complete',
                                             execution_delta=None,  # Same day as today
                                             dag=dag)

ImportProgramLogMinus14 = TriggerDagRunOperator(task_id='ImportProgramLogMinus14',
                                                trigger_dag_id="03_ImportProgramLog",
                                                python_callable=trigger_dag_run_pass_params,
                                                params={'daysDelta': 14},
                                                dag=dag)

WaitForImportProgramLogMinus14 = ExternalTaskSensor(task_id='WaitForImportProgramLogMinus14',
                                                    external_dag_id='03_ImportProgramLog',
                                                    external_task_id='dag_complete',
                                                    execution_delta=None,  # Same day as today
                                                    dag=dag)
Example 29
    org_mozilla_vrbrowser_derived__clients_daily__v1 = bigquery_etl_query(
        task_id="org_mozilla_vrbrowser_derived__clients_daily__v1",
        destination_table="clients_daily_v1",
        dataset_id="org_mozilla_vrbrowser_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    wait_for_copy_deduplicate_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        check_existence=True,
        dag=dag,
    )

    org_mozilla_vrbrowser_derived__baseline_daily__v1.set_upstream(
        wait_for_copy_deduplicate_copy_deduplicate_all)

    org_mozilla_vrbrowser_derived__metrics_daily__v1.set_upstream(
        wait_for_copy_deduplicate_copy_deduplicate_all)

    org_mozilla_vrbrowser_derived__clients_last_seen__v1.set_upstream(
        org_mozilla_vrbrowser_derived__clients_daily__v1)

    org_mozilla_vrbrowser_derived__clients_daily__v1.set_upstream(
        org_mozilla_vrbrowser_derived__baseline_daily__v1)
Example 30
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

dag = DAG(
    'etl_orders',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly placed orders daily.'
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag
)

# Wait for etl_menu_items DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_menu_items',
    external_dag_id='etl_menu_items',
    mode='reschedule',
    dag=dag
)

t3 = PostgresOperator(
    task_id='if_not_exists',
    postgres_conn_id='food_delivery_db',
    sql='''
Example 31
default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': True,
    'start_date': datetime(2017, 1, 1),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('fx_usage_report', default_args=default_args, schedule_interval='@weekly')

wait_for_main_summary = ExternalTaskSensor(
    task_id='wait_for_main_summary',
    external_dag_id='main_summary',
    external_task_id='main_summary',
    execution_delta=timedelta(days=-7, hours=-1), # main_summary waits one hour, execution date is beginning of the week
    dag=dag)

usage_report = EMRSparkOperator(
    task_id="fx_usage_report",
    job_name="Fx Usage Report",
    execution_timeout=timedelta(hours=4),
    instance_count=10,
    release_label="emr-5.11.0",
    owner="*****@*****.**",
    email=["*****@*****.**", "*****@*****.**", "*****@*****.**"],
    env={"date": DS_WEEKLY,
         "bucket": "{{ task.__class__.public_output_bucket }}",
         "deploy_environment": "{{ task.__class__.deploy_environment }}"},
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/fx_usage_report.sh",