Example 1
start_date = datetime.datetime.utcnow()

dag = DAG(
    "lesson3.exercise3",
    start_date=start_date,
)

trips_task_id = "trips_subdag"
trips_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        trips_task_id,
        "redshift",
        "aws_credentials",
        "trips",
        sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=start_date,
    ),
    task_id=trips_task_id,
    dag=dag,
)

stations_task_id = "stations_subdag"
stations_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        stations_task_id,
        "redshift",
        "aws_credentials",
Example 2
    transfer_options={'deleteObjectsUniqueInSink': True},
    dag=dag,
)

# Spark job reads GCS JSON and writes GCS Parquet
crash_report_parquet = SubDagOperator(
    task_id="crash_report_parquet",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name='crash_report_parquet',
        default_args=default_args,
        cluster_name=cluster_name,
        job_name="Socorro_Crash_Reports_to_Parquet",
        python_driver_code=
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/socorro_import_crash_data.py",
        py_args=[
            "--date", "{{ ds_nodash }}", "--source-gcs-path",
            "gs://{}/v1/crash_report".format(gcs_data_bucket),
            "--dest-gcs-path", "gs://{}/{}".format(gcs_data_bucket, dataset)
        ],
        idle_delete_ttl='14400',
        num_workers=8,
        worker_machine_type='n1-standard-8',
        aws_conn_id=read_aws_conn_id,
        gcp_conn_id=gcp_conn_id))

bq_gcp_conn_id = 'google_cloud_derived_datasets'
bq_connection = GoogleCloudBaseHook(gcp_conn_id=bq_gcp_conn_id)
gke_location = "us-central1-a"
gke_cluster_name = "bq-load-gke-1"
            "to": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "doc-type": "first_shutdown",
            "read-mode": "aligned",
            "input-partition-multiplier": "4"
        }),
    uri=
    "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

first_shutdown_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="first_shutdown_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="first_shutdown_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
        cluster_by=["sample_id"],
        drop=["submission_date"],
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="first_shutdown_summary_bigquery_load",
    dag=dag)

first_shutdown_summary >> first_shutdown_summary_bigquery_load
Example 4
PROJECT_VERSION = '1.0'
PROJECT_NAME = 'post-collector'

# MAIN DAGS
# interval = "0 3 */1 * *"
interval = "*/10 * * * *"
DAG_ID = 'post_collector'
start_date = datetime.strptime(Variable.get("post_collector_start_date"), "%Y-%m-%d %H:%M:%S")
emails = Variable.get('support_email_list').split(',')
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'email': emails,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=2)
}

with DAG(dag_id=DAG_ID, default_args=default_args, schedule_interval=interval, start_date=start_date) as dag:

    main_subdags_id = 'all_process'
    process_keywords_dag = SubDagOperator(
        task_id=main_subdags_id,
        subdag=all_process(
            "{0}.{1}".format(DAG_ID, main_subdags_id), start_date, interval, default_args),
        depends_on_past=True,
        dag=dag
    )
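
The all_process factory called above is not included in this excerpt. The sketch below is an assumption, with a placeholder task standing in for the real processing steps; the only hard requirement it illustrates is that the factory returns a DAG whose dag_id matches the "<parent_dag_id>.<task_id>" string the caller already builds.

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator


def all_process(dag_id, start_date, schedule_interval, default_args):
    # Hypothetical sketch: dag_id arrives pre-formatted as
    # "<parent_dag_id>.<task_id>", which is what SubDagOperator expects.
    subdag = DAG(
        dag_id=dag_id,
        start_date=start_date,
        schedule_interval=schedule_interval,
        default_args=default_args,
    )
    # Placeholder for the real processing tasks (assumed, not from the source).
    DummyOperator(task_id='placeholder_step', dag=subdag)
    return subdag
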
begin_execution = DummyOperator(task_id='begin_execution', dag=dag)

end_execution = DummyOperator(task_id='end_execution', dag=dag)

staging_events_task_id = "staging_events_subdag"
staging_events_subdag_task = SubDagOperator(
    subdag=stg_subdag(
        "a_song_plays_hourly",
        staging_events_task_id,
        "aws_credentials",
        "redshift",
        "staging_events",
        sql_statements.CREATE_TABLE_STAGING_EVENTS,
        s3_prefix="s3:/",
        s3_bucket="udacity-dend",
        s3_key="log_data/{execution_date.year}/{execution_date.month}",
        s3_jsonpath_file="log_json_path.json",
        sw_delete_stages=sw_delete_stages,
        partition_year="{execution_date.year}",
        partition_month="{execution_date.month}",
        start_date=start_date,
    ),
    task_id=staging_events_task_id,
    dag=dag,
)

staging_songs_task_id = "staging_songs_subdag"
staging_songs_subdag_task = SubDagOperator(
    subdag=stg_subdag(
        "a_song_plays_hourly",
        staging_songs_task_id,
Example 6
    src="/airflow/dags/spark-scripts/generate_show_comments.py",
    dst="spark-jobs/generate_show_comments.py",
    bucket=gcs_netflix_bucket,
    google_cloud_storage_conn_id=gcp_conn,
    dag=dag)

catalog_task_id = "show_catalog_subdag"
catalog_path = "catalog/clean/catalog.parquet"
download_catalog_show_subdag = SubDagOperator(subdag=catalog_show_to_gcs(
    "content_review",
    catalog_task_id,
    kaggle_bucket="shivamb/netflix-shows",
    kaggle_local_destination_path="/airflow/datasources/catalog/csv",
    gcp_conn_id=gcp_conn,
    gcs_bucket=gcs_netflix_bucket,
    gcs_raw_destination_path="catalog/raw/catalog.csv",
    gcs_clean_destination_path=catalog_path,
    cluster_name=cluster_name,
    spark_code_path="gs://" + gcs_netflix_bucket +
    "/spark-jobs/clean_netflix_catalog.py",
    region=region,
    start_date=start_date),
                                              task_id=catalog_task_id,
                                              dag=dag)

consume_show_comments_job_path = "gs://" + gcs_netflix_bucket + "/spark-jobs/consume_reddit_comments.py"
reddit_destination_path = "gs://" + gcs_netflix_bucket + "/comments/raw/comments.parquet"
gcp_netflix_catalog_path = "gs://" + gcs_netflix_bucket + "/" + catalog_path

consume_show_comment_to_datalake = DataProcPySparkOperator(
    task_id='consume_show_comment_to_datalake',
    main=consume_show_comments_job_path,
Example 7
        def nested_subdags():
            from airflow.models import DAG
            from airflow.operators.dummy_operator import DummyOperator
            from airflow.operators.subdag_operator import SubDagOperator
            import datetime
            DAG_NAME = 'master'
            DEFAULT_ARGS = {
                'owner': 'owner1',
                'start_date': datetime.datetime(2016, 1, 1)
            }
            dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS)

            # master:
            #     A -> opSubdag_0
            #          master.opSubdag_0:
            #              -> opSubDag_A
            #                 master.opSubdag_0.opSubdag_A:
            #                     -> subdag_A.task
            #              -> opSubdag_B
            #                 master.opSubdag_0.opSubdag_B:
            #                     -> subdag_B.task
            #     A -> opSubdag_1
            #          master.opSubdag_1:
            #              -> opSubdag_C
            #                 master.opSubdag_1.opSubdag_C:
            #                     -> subdag_C.task
            #              -> opSubDag_D
            #                 master.opSubdag_1.opSubdag_D:
            #                     -> subdag_D.task

            with dag:

                def subdag_A():
                    subdag_A = DAG('master.opSubdag_0.opSubdag_A',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_A.task', dag=subdag_A)
                    return subdag_A

                def subdag_B():
                    subdag_B = DAG('master.opSubdag_0.opSubdag_B',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_B.task', dag=subdag_B)
                    return subdag_B

                def subdag_C():
                    subdag_C = DAG('master.opSubdag_1.opSubdag_C',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_C.task', dag=subdag_C)
                    return subdag_C

                def subdag_D():
                    subdag_D = DAG('master.opSubdag_1.opSubdag_D',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_D.task', dag=subdag_D)
                    return subdag_D

                def subdag_0():
                    subdag_0 = DAG('master.opSubdag_0',
                                   default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_A',
                                   dag=subdag_0,
                                   subdag=subdag_A())
                    SubDagOperator(task_id='opSubdag_B',
                                   dag=subdag_0,
                                   subdag=subdag_B())
                    return subdag_0

                def subdag_1():
                    subdag_1 = DAG('master.opSubdag_1',
                                   default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_C',
                                   dag=subdag_1,
                                   subdag=subdag_C())
                    SubDagOperator(task_id='opSubdag_D',
                                   dag=subdag_1,
                                   subdag=subdag_D())
                    return subdag_1

                opSubdag_0 = SubDagOperator(task_id='opSubdag_0',
                                            dag=dag,
                                            subdag=subdag_0())
                opSubdag_1 = SubDagOperator(task_id='opSubdag_1',
                                            dag=dag,
                                            subdag=subdag_1())

                opA = DummyOperator(task_id='A')
                opA.set_downstream(opSubdag_0)
                opA.set_downstream(opSubdag_1)

            return dag
Example 8
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20190910', # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):

    """ Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str dataset_s3_bucket:          source S3 Bucket
    :param str dataset_gcs_bucket:         destination GCS Bucket
    :param str aws_conn_id:                airflow connection id for S3 access
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str date_submission_col:        dataset date submission column
    :param str ds_type:                    dataset format (ds or ds_nodash)
    :param str gke_location:               GKE cluster zone
    :param str gke_namespace:              GKE cluster namespace
    :param str docker_image:               docker image to use for GKE pod operations # noqa
    :param str bigquery_dataset:           bigquery load destination dataset
    :param str p2b_concurrency:            number of processes for parquet2bigquery load
    :param str p2b_table_alias:            override p2b table name with alias
    :param bool p2b_resume:                allow resume support. Defaults to False.
    :param bool reprocess:                 enable dataset reprocessing. Defaults to False.
    :param str objects_prefix:             custom objects_prefix to override defaults
    :param str spark_gs_dataset_location:  custom spark dataset load location to override defaults
    :param List[str] cluster_by:           top level fields to cluster by when creating destination table
    :param List[str] drop:                 top level fields to exclude from destination table
    :param Dict[str, str] rename:          top level fields to rename in destination table
    :param List[str] replace:              top level field replacement expressions

    :return: airflow.models.DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes':  _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
        ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]

    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset,
                                                                   dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
            task_id='s3_to_gcs',
            s3_bucket=dataset_s3_bucket,
            gcs_bucket=gcs_buckets['transfer'],
            description=_objects_prefix,
            aws_conn_id=aws_conn_id,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            object_conditions=gcstj_object_conditions,
            transfer_options=gcstj_transfer_options
            )

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name), # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
            )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

        return dag
register_status(main_summary, "Main Summary", "A summary view of main pings.")

main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="main_summary_bigquery_load",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
        options={
            "input_bucket": "{{ task.__class__.private_output_bucket }}",
Example 10
bgbb_fit_dataproc = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="bgbb_fit_dataproc",
        cluster_name="bgbb-fit-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=3,
        worker_machine_type="n1-standard-8",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars":
            "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
        },
        additional_metadata={
            "PIP_PACKAGES": "git+https://github.com/wcbeard/bgbb_airflow.git"
        },
        python_driver_code="gs://{}/jobs/bgbb_runner.py".format(
            params.artifact_bucket),
        py_args=[
            "bgbb_fit",
            "--submission-date",
            "{{ next_ds }}",
            "--model-win",
            "90",
            "--start-params",
            "[0.387, 0.912, 0.102, 1.504]",
            "--sample-ids",
            "[42]",
            "--sample-fraction",
            "1.0",
            "--penalizer-coef",
            "0.01",
            "--source",
            "bigquery",
            "--view-materialization-project",
            params.project_id if params.is_dev else "moz-fx-data-shared-prod",
            "--view-materialization-dataset",
            "analysis",
            "--bucket-protocol",
            "gs",
            "--bucket",
            params.output_bucket,
            "--prefix",
            "bgbb/params/v1",
        ],
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)
Example 11
ltv_daily = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="ltv-daily",
        cluster_name="ltv-daily-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=5,
        worker_machine_type="n1-standard-8",
        optional_components=["ANACONDA"],
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars":
            "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
        },
        additional_metadata={"PIP_PACKAGES": "lifetimes==0.11.1"},
        python_driver_code="gs://{}/jobs/ltv_daily.py".format(
            params.artifact_bucket),
        py_args=[
            "--submission-date",
            "{{ ds }}",
            "--prediction-days",
            "364",
            "--project-id",
            "moz-fx-data-shared-prod",
            "--source-qualified-table-id",
            "moz-fx-data-shared-prod.search.search_rfm",
            "--dataset-id",
            "analysis",
            "--intermediate-table-id",
            "ltv_daily_temporary_search_rfm_day",
            "--model-input-table-id",
            "ltv_daily_model_perf",
            "--model-output-table-id",
            "ltv_daily",
            "--temporary-gcs-bucket",
            params.storage_bucket,
        ],
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)
    format_type="json",
    format_style="auto")

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    target_table="songplays",
    redshift_conn_id="redshift",
    select_sql_stmt=SqlQueries.songplay_table_insert,
)

dim_task_id = "load_data_into_dimension_tables"
load_dimension_subdag_task = SubDagOperator(
    subdag=load_to_dimension_tables_dag("sparkify_pipeline_3",
                                        dim_task_id,
                                        "redshift",
                                        SqlQueries,
                                        start_date=start_date),
    task_id=dim_task_id,
    dag=dag)

dq_checks = [{
    'check_sql': "SELECT COUNT(*) FROM songplays;",
    'test_expr': "{} < 1"
}, {
    'check_sql': "SELECT COUNT(*) FROM users WHERE userid is NULL;",
    'test_expr': "{} >= 1"
}]

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         dag=dag,
                                         redshift_conn_id="redshift",
    provide_context=True,
    python_callable=generate_search_terms,
    dag=dag,
)

email_links = EmailOperator(
    task_id="email_best_links",
    to="*****@*****.**",
    subject="Latest popular links",
    html_content="Check out the latest!!",
    files=["{}/latest_links.txt".format(RAW_TWEET_DIR)],
    dag=dag,
)

sub = SubDagOperator(subdag=subdag,
                     task_id="insert_and_id_pop",
                     trigger_rule="one_success",
                     dag=dag)

clear_latest = BashOperator(
    bash_command="rm -rf {}/latest_links.txt".format(RAW_TWEET_DIR),
    task_id="clear_latest",
    dag=dag,
)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r"\W+", "", term)
    simple_search = PythonOperator(
        task_id="search_{}_twitter".format(term_without_punctuation),
        provide_context=True,
Example 14
def repeat_dag(context, dag_run_obj):
    rq = context['params']['rq']
    if rq.QueueSize() > 0:
        return dag_run_obj


# @TODO find a way to make these separate tasks.  Difficult because they
#  can't be pickled, therefore they can't be returned via a task.
Session, _ = db_connect('pdsdi_dev')
session = Session()
rq = RedisQueue('DI_ReadyQueue')

process_operator = SubDagOperator(subdag=process_subdag('di_process',
                                                        'di_checksum',
                                                        session=session,
                                                        archiveID=archiveID,
                                                        n_procs=5,
                                                        rq=rq),
                                  task_id='di_checksum',
                                  dag=dag)

loop_operator = TriggerDagRunOperator(task_id='loop',
                                      provide_context=True,
                                      params={'rq': rq},
                                      trigger_dag_id='di_process',
                                      python_callable=repeat_dag,
                                      dag=dag)

process_operator >> loop_operator
Example 15
register_status(main_summary, "Main Summary", "A summary view of main pings.")

main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="main_summary_bigquery_load",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
        options={
            "input_bucket": "{{ task.__class__.private_output_bucket }}",
    for i in range(2):
        DummyOperator(
            task_id='%s-task-%s' % (child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )

    return dag_subdag


with DAG(
        dag_id=DAG_NAME,
        start_date=datetime(2019, 1, 1),
        max_active_runs=1,
        default_args=DEFAULT_TASK_ARGS,
        schedule_interval=timedelta(minutes=1),
) as dag:

    start = DummyOperator(task_id='start', )

    section_1 = SubDagOperator(
        task_id='section-1',
        subdag=subdag(DAG_NAME, 'section-1', DEFAULT_TASK_ARGS),
        default_args=DEFAULT_TASK_ARGS,
    )

    some_other_task = DummyOperator(task_id='some-other-task', )

    start >> section_1 >> some_other_task  # pylint: disable=W0104
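
Only the tail of the subdag factory (the task loop and the return statement) survives in the excerpt above. A sketch of what the full factory presumably looks like, assuming the child DAG simply reuses the parent's one-minute schedule, is:

from datetime import timedelta

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator


def subdag(parent_dag_name, child_dag_name, args):
    # The child dag_id must be "<parent_dag_id>.<task_id>" for SubDagOperator.
    dag_subdag = DAG(
        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval=timedelta(minutes=1),  # assumed to mirror the parent
    )

    for i in range(2):
        DummyOperator(
            task_id='%s-task-%s' % (child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )

    return dag_subdag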
Example 17
# start_date = datetime.datetime.utcnow()
start_date = datetime.datetime(2018, 1, 1, 0, 0, 0, 0)
end_date = datetime.datetime(2018, 6, 1, 0, 0, 0, 0)

dag = DAG("lesson3.exercise3",
          start_date=start_date,
          end_date=end_date,
          schedule_interval="@monthly")

trips_task_id = "trips_subdag"
trips_subdag_task = SubDagOperator(subdag=get_s3_to_redshift_dag(
    parent_dag_name="lesson3.exercise3",
    task_id=trips_task_id,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials_redshift",
    table="trips",
    create_sql_stmt=sql_statements.CREATE_TRIPS_TABLE_SQL,
    s3_bucket="udacity-dend",
    s3_key="udac-data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
    start_date=start_date),
                                   task_id=trips_task_id,
                                   dag=dag)

stations_task_id = "stations_subdag"
stations_subdag_task = SubDagOperator(subdag=get_s3_to_redshift_dag(
    parent_dag_name="lesson3.exercise3",
    task_id=stations_task_id,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials_redshift",
    table="stations",
    create_sql_stmt=sql_statements.CREATE_STATIONS_TABLE_SQL,
    s3_bucket="udacity-dend",
load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="songplays",
    sql_source=SqlQueries.songplay_table_insert)

# Create & Load dimension tables
# artists table
load_artists_dimension_table_task_id = "artists_subdag"
load_artists_dimension_table = SubDagOperator(
    subdag=load_dim_table_dag(parent_dag_name=parent_task_id,
                              task_id=load_artists_dimension_table_task_id,
                              redshift_conn_id="redshift",
                              table="artists",
                              create_sql_stmt=CreateTables.create_artists,
                              select_stmt=SqlQueries.artist_table_insert,
                              append_rows=False,
                              start_date=start_date),
    task_id=load_artists_dimension_table_task_id,
    dag=dag,
)

# songs table
load_songs_dimension_table_task_id = "songs_subdag"
load_songs_dimension_table = SubDagOperator(
    subdag=load_dim_table_dag(parent_dag_name=parent_task_id,
                              task_id=load_songs_dimension_table_task_id,
                              redshift_conn_id="redshift",
                              table="songs",
                              create_sql_stmt=CreateTables.create_songs,
                              select_stmt=SqlQueries.song_table_insert,
Example 19
        "com.mozilla.telemetry.views.SyncView", {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri=
    "https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

sync_view_bigquery_load = SubDagOperator(subdag=load_to_bigquery(
    parent_dag_name=dag.dag_id,
    dag_name="sync_view_bigquery_load",
    default_args=default_args,
    dataset_s3_bucket="telemetry-parquet",
    aws_conn_id="aws_dev_iam_s3",
    dataset="sync_summary",
    dataset_version="v2",
    gke_cluster_name="bq-load-gke-1",
    bigquery_dataset="telemetry_derived",
),
                                         task_id="sync_view_bigquery_load",
                                         dag=dag)

sync_events_view = EMRSparkOperator(
    task_id="sync_events_view",
    job_name="Sync Events View",
    execution_timeout=timedelta(hours=10),
    instance_count=1,
    email=['*****@*****.**'],
    env=tbv_envvar(
        "com.mozilla.telemetry.views.SyncEventView", {
Example 20
####################################################################################

# task created by instantiating the DummyOperator
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Task to create, load from s3 bucket udacity-dend/song_data and check the staging_songs table
staging_songs_task_id = "staging_songs_subdag"
staging_songs_task = SubDagOperator(
    subdag=get_s3_to_redshift_subdag(
        "ETL_Sparkify_0",  #name parent dag
        staging_songs_task_id,  #task_id
        "redshift",  #redshift_conn_id
        "aws_credential",  #aws_credentials_id
        create_tbl=CreateTables.staging_songs_table_create,
        target_table="staging_songs",
        sql_row=SqlQueries.has_rows,
        s3_bucket="udacity-dend",
        s3_key="song_data",
        custom=" json 'auto' compupdate off region 'us-west-2'",
        start_date=datetime.datetime(2018, 11, 1, 0, 0, 0, 0),
    ),
    task_id=staging_songs_task_id,
    depends_on_past=True,
    dag=dag)
# Task to create, load from s3 bucket udacity-dend/log_data and check the staging_events table
staging_events_task_id = "staging_events_subdag"
staging_events_task = SubDagOperator(
    subdag=get_s3_to_redshift_subdag(
        "ETL_Sparkify_0",  #name parent dag
        staging_events_task_id,  #task_id
        "redshift",  #redshift_conn_id
Example 21
        def nested_subdag_cycle():
            from airflow.models import DAG
            from airflow.operators.dummy_operator import DummyOperator
            from airflow.operators.subdag_operator import SubDagOperator
            import datetime
            DAG_NAME = 'nested_cycle'
            DEFAULT_ARGS = {
                'owner': 'owner1',
                'start_date': datetime.datetime(2016, 1, 1)
            }
            dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS)

            # cycle:
            #     A -> opSubdag_0
            #          cycle.opSubdag_0:
            #              -> opSubDag_A
            #                 cycle.opSubdag_0.opSubdag_A:
            #                     -> subdag_A.task
            #              -> opSubdag_B
            #                 cycle.opSubdag_0.opSubdag_B:
            #                     -> subdag_B.task
            #     A -> opSubdag_1
            #          cycle.opSubdag_1:
            #              -> opSubdag_C
            #                 cycle.opSubdag_1.opSubdag_C:
            #                     -> subdag_C.task -> subdag_C.task  >Invalid Loop<
            #              -> opSubDag_D
            #                 cycle.opSubdag_1.opSubdag_D:
            #                     -> subdag_D.task

            with dag:

                def subdag_A():
                    subdag_A = DAG('nested_cycle.opSubdag_0.opSubdag_A',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_A.task', dag=subdag_A)
                    return subdag_A

                def subdag_B():
                    subdag_B = DAG('nested_cycle.opSubdag_0.opSubdag_B',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_B.task', dag=subdag_B)
                    return subdag_B

                def subdag_C():
                    subdag_C = DAG('nested_cycle.opSubdag_1.opSubdag_C',
                                   default_args=DEFAULT_ARGS)
                    opSubdag_C_task = DummyOperator(task_id='subdag_C.task',
                                                    dag=subdag_C)
                    # introduce a loop in opSubdag_C
                    opSubdag_C_task.set_downstream(opSubdag_C_task)
                    return subdag_C

                def subdag_D():
                    subdag_D = DAG('nested_cycle.opSubdag_1.opSubdag_D',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_D.task', dag=subdag_D)
                    return subdag_D

                def subdag_0():
                    subdag_0 = DAG('nested_cycle.opSubdag_0',
                                   default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_A',
                                   dag=subdag_0,
                                   subdag=subdag_A())
                    SubDagOperator(task_id='opSubdag_B',
                                   dag=subdag_0,
                                   subdag=subdag_B())
                    return subdag_0

                def subdag_1():
                    subdag_1 = DAG('nested_cycle.opSubdag_1',
                                   default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_C',
                                   dag=subdag_1,
                                   subdag=subdag_C())
                    SubDagOperator(task_id='opSubdag_D',
                                   dag=subdag_1,
                                   subdag=subdag_D())
                    return subdag_1

                opSubdag_0 = SubDagOperator(task_id='opSubdag_0',
                                            dag=dag,
                                            subdag=subdag_0())
                opSubdag_1 = SubDagOperator(task_id='opSubdag_1',
                                            dag=dag,
                                            subdag=subdag_1())

                opA = DummyOperator(task_id='A')
                opA.set_downstream(opSubdag_0)
                opA.set_downstream(opSubdag_1)

            return dag
Example 22
prerelease_telemetry_aggregate_view_dataproc = SubDagOperator(
    task_id=task_id,
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name=task_id,
        job_name="prerelease_aggregates",
        cluster_name="prerelease-telemetry-aggregates-{{ ds_nodash }}",
        idle_delete_ttl="600",
        num_workers=10,
        worker_machine_type="n1-standard-8",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_properties={
            "spark:spark.jars": "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
            "spark:spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
        },
        additional_metadata={
            "PIP_PACKAGES": "git+https://github.com/mozilla/python_mozaggregator.git"
        },
        python_driver_code="gs://{}/jobs/mozaggregator_runner.py".format(
            artifact_bucket
        ),
        py_args=[
            "aggregator",
            "--date",
            "{{ ds_nodash }}",
            "--channels",
            "nightly,aurora,beta",
            "--postgres-db",
            "telemetry",
            "--postgres-user",
            "root",
            "--postgres-pass",
            "{{ var.value.mozaggregator_postgres_pass }}",
            "--postgres-host",
            "{{ var.value.mozaggregator_postgres_host }}",
            "--postgres-ro-host",
            "{{ var.value.mozaggregator_postgres_ro_host }}",
            "--num-partitions",
            str(10 * 32),
        ]
        + (
            ["--source", "bigquery", "--project-id", "moz-fx-data-shared-prod"]
            if not EXPORT_TO_AVRO
            else [
                "--source",
                "avro",
                "--avro-prefix",
                "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease/moz-fx-data-shared-prod",
            ]
        ),
        gcp_conn_id=gcp_conn.gcp_conn_id,
        service_account=client_email,
        artifact_bucket=artifact_bucket,
        storage_bucket=storage_bucket,
        default_args=subdag_args,
    ),
)
Example 23
    dag=dag6,
)
dag6_task2.set_upstream(dag6_task1)

# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(task_id='test_subdag_fail',
                               dag=subdag7,
                               python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,
)
subdag7_task3 = DummyOperator(task_id='test_subdag_dummy_2', dag=subdag7)
dag7_subdag1 = SubDagOperator(task_id='subdag', dag=dag7, subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that queued tasks are run
dag8 = DAG(dag_id='test_scheduled_queued_tasks',
           start_date=DEFAULT_DATE,
           end_date=DEFAULT_DATE,
           default_args=default_args)
dag8_task1 = PythonOperator(
    # use delayed_fail because otherwise LocalExecutor will have a chance to
    # complete the task
    python_callable=delayed_fail,
    task_id='test_queued_task',
    dag=dag8,
    pool='test_queued_pool')
Example 24
dag = DAG(
    dag_id='cm_load',
    default_args=args,
    schedule_interval=schedule_interval,
    start_date=datetime(2017, 1, 1),
    max_active_runs=1,
    # concurrency = 1,
    catchup=False,
    dagrun_timeout=timedelta(minutes=24 * 60))  # dag run times out after 1 day

sub_dag_extract_network_externals_task = SubDagOperator(
    subdag=extract_network_externals('cm_load',
                                     'extract_network_externals',
                                     start_date=dag.start_date,
                                     schedule_interval=dag.schedule_interval),
    task_id='extract_network_externals',
    dag=dag,
)

sub_dag_cm_load_house_keeping_task = SubDagOperator(
    subdag=run_house_keeping_tasks('cm_load',
                                   'cm_load_house_keeping',
                                   start_date=dag.start_date,
                                   schedule_interval=dag.schedule_interval),
    task_id='cm_load_house_keeping',
    dag=dag,
)

sub_dag_parse_and_import_eri_3g4g_cm_files = SubDagOperator(
    subdag=parse_and_import_eri_3g4g('cm_load',
Example 25
    task_id="clients_daily_keyed_histogram_aggregates",
    project_id=project_id,
    source_dataset_id=dataset_id,
    sample_size=PERCENT_RELEASE_WINDOWS_SAMPLING,
    overwrite=False,
    probe_type="keyed_histogram",
    get_logs=False,
    dag=dag,
)

clients_histogram_aggregates = SubDagOperator(
    subdag=histogram_aggregates_subdag(
        GLAM_DAG,
        GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG,
        default_args,
        dag.schedule_interval,
        dataset_id,
    ),
    task_id=GLAM_CLIENTS_HISTOGRAM_AGGREGATES_SUBDAG,
    executor=get_default_executor(),
    dag=dag,
)

histogram_percentiles = bigquery_etl_query(
    task_id="histogram_percentiles",
    destination_table="histogram_percentiles_v1",
    dataset_id=dataset_id,
    project_id=project_id,
    owner="*****@*****.**",
    date_partition_parameter=None,
    arguments=("--replace", ),
    dag=dag,
Example 26

def create_subdag_2(parent_dag_id, subdag_name, schedule_interval):
    with DAG(dag_id='{}.{}'.format(parent_dag_id, subdag_name),
             schedule_interval=schedule_interval,
             catchup=False,
             default_args=default_args) as subdag:
        task = BashOperator(task_id='task',
                            bash_command='echo "Sub-DAG 2 executed !!"')

        return subdag


with DAG(dag_id='08_subdags',
         schedule_interval='*/10 * * * *',
         catchup=False,
         default_args=default_args) as dag:
    sub_dag_1_name = 'sub_dag_1'
    sub_dag_1_task = SubDagOperator(subdag=create_subdag_1(
        dag.dag_id, sub_dag_1_name, dag.schedule_interval),
                                    task_id=sub_dag_1_name)

    foo = DummyOperator(task_id='foo')
    sub_dag_1_task >> foo

    sub_dag_2_name = 'sub_dag_2'
    sub_dag_2_task = SubDagOperator(subdag=create_subdag_2(
        dag.dag_id, sub_dag_2_name, dag.schedule_interval),
                                    task_id=sub_dag_2_name)
    foo >> sub_dag_2_task
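
create_subdag_1 is referenced above but its definition is cut off; presumably it mirrors create_subdag_2. A sketch under that assumption, reusing the same module-level default_args and showing the imports that would already sit at the top of the original module:

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator


def create_subdag_1(parent_dag_id, subdag_name, schedule_interval):
    # Assumed to be symmetric with create_subdag_2 above.
    with DAG(dag_id='{}.{}'.format(parent_dag_id, subdag_name),
             schedule_interval=schedule_interval,
             catchup=False,
             default_args=default_args) as subdag:
        task = BashOperator(task_id='task',
                            bash_command='echo "Sub-DAG 1 executed !!"')

        return subdag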
Example 27
    'retry_delay':
    datetime.timedelta(minutes=10),
    'schedule_interval':
    '0 1 * * *',
}

dag_name = 'bq_events_to_amplitude'

with models.DAG(dag_name, default_args=default_args) as dag:

    fenix_task_id = 'fenix_amplitude_export'
    SubDagOperator(subdag=export_to_amplitude(
        dag_name=fenix_task_id,
        parent_dag_name=dag_name,
        default_args=default_args,
        project='moz-fx-data-derived-datasets',
        dataset='telemetry',
        table_or_view='fenix_events_v1',
        s3_prefix='fenix',
    ),
                   task_id=fenix_task_id)

    fennec_ios_task_id = 'fennec_ios_amplitude_export'
    fennec_ios_args = default_args.copy()
    fennec_ios_args["start_date"] = datetime.datetime(2019, 12, 2)
    SubDagOperator(subdag=export_to_amplitude(
        dag_name=fennec_ios_task_id,
        parent_dag_name=dag_name,
        default_args=fennec_ios_args,
        project='moz-fx-data-shared-prod',
        dataset='telemetry',
        table_or_view='fennec_ios_events_v1',
load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    sql_query=SqlQueries.songplay_table_insert)

load_user_dimension_table_task_id = 'Load_user_dim_table'
load_user_dimension_table = SubDagOperator(
    subdag=load_dimensional_tables_dag(
        parent_dag_name=dag_name,
        task_id=load_user_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        start_date=datetime(2018, 5, 1),
        table="users",
        sql_query=SqlQueries.user_table_insert,
    ),
    task_id=load_user_dimension_table_task_id,
    dag=dag,
)

load_song_dimension_table_task_id = 'Load_song_dim_table'
load_song_dimension_table = SubDagOperator(
    subdag=load_dimensional_tables_dag(
        parent_dag_name=dag_name,
        task_id=load_song_dimension_table_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        start_date=datetime(2018, 5, 1),
Example 29
crash_report_parquet = SubDagOperator(
    task_id="hardware_report",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="hardware_report",
        default_args=default_args,
        cluster_name=cluster_name,
        job_name="Firefox_Hardware_Report",
        python_driver_code=
        "gs://moz-fx-data-prod-airflow-dataproc-artifacts/jobs/hardware_report.py",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_metadata={
            'PIP_PACKAGES':
            "google-cloud-bigquery==1.21.0 python_moztelemetry==0.10.2 boto3==1.9.87 click==6.7 click_datetime==0.2 requests-toolbelt==0.8.0 requests==2.20.1 typing==3.6.4"
        },
        additional_properties={
            "spark:spark.jars":
            "gs://spark-lib/bigquery/spark-bigquery-latest.jar",
            "spark-env:AWS_ACCESS_KEY_ID": aws_access_key,
            "spark-env:AWS_SECRET_ACCESS_KEY": aws_secret_key
        },
        py_args=[
            "--start_date",
            DS_WEEKLY,
            "--bucket",
            "telemetry-public-analysis-2",
            "--spark-provider",
            "dataproc",
        ],
        idle_delete_ttl='14400',
        num_workers=15,
        worker_machine_type='n1-standard-4',
        gcp_conn_id=gcp_conn_id))
    "input_event_response_coalesced_ms_main_above_2500",
    "input_event_response_coalesced_ms_content_above_150",
    "input_event_response_coalesced_ms_content_above_250",
    "input_event_response_coalesced_ms_content_above_2500",
    "ghost_windows_main_above_1",
    "ghost_windows_content_above_1",
]

main_summary_export = SubDagOperator(subdag=export_to_parquet(
    table=
    "moz-fx-data-shared-prod:telemetry_derived.main_summary_v4${{ds_nodash}}",
    static_partitions=["submission_date_s3={{ds_nodash}}"],
    arguments=[
        "--partition-by=sample_id",
        "--replace='{{ds_nodash}}' AS submission_date",
        "--maps-from-entries",
    ] + main_summary_bigint_columns,
    parent_dag_name=dag.dag_id,
    dag_name="main_summary_export",
    default_args=default_args,
    num_workers=40),
                                     task_id="main_summary_export",
                                     executor=get_default_executor(),
                                     dag=dag)

clients_daily_export = SubDagOperator(
    subdag=export_to_parquet(
        table=
        "moz-fx-data-shared-prod:telemetry_derived.clients_daily_v6${{ds_nodash}}",
        static_partitions=["submission_date_s3={{ds_nodash}}"],
        arguments=[
            # restore legacy schema
Example 31
    instance_count=10,
    env=mozetl_envvar("churn", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)

churn_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="churn",
        dataset_version="v3",
        date_submission_col="week_start",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="churn_bigquery_load",
    dag=dag)

churn_v2 = MozDatabricksSubmitRunOperator(
    task_id="churn_v2",
    job_name="churn 7-day v2",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar("churn", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
modules_with_missing_symbols = SubDagOperator(
    task_id="modules_with_missing_symbols",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        image_version="1.5",
        dag_name="modules_with_missing_symbols",
        default_args=default_args,
        cluster_name="modules-with-missing-symbols-{{ ds }}",
        job_name="modules-with-missing-symbols",
        python_driver_code=
        "https://raw.githubusercontent.com/mozilla/python_mozetl/main/mozetl/symbolication/modules_with_missing_symbols.py",
        init_actions_uris=[
            "gs://dataproc-initialization-actions/python/pip-install.sh"
        ],
        additional_metadata={"PIP_PACKAGES": " ".join(PIP_PACKAGES)},
        additional_properties={
            "spark:spark.jars":
            "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
            "spark-env:AWS_ACCESS_KEY_ID": ses_access_key,
            "spark-env:AWS_SECRET_ACCESS_KEY": ses_secret_key,
        },
        py_args=[
            "--run-on-days",
            "0",  # run monday
            "--date",
            "{{ ds }}"
        ],
        idle_delete_ttl="14400",
        num_workers=2,
        worker_machine_type="n1-standard-4",
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        storage_bucket=params.storage_bucket,
    ),
)
dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)
Example 34
    execution_timeout=timedelta(hours=10),
    instance_count=5,
    env = tbv_envvar("com.mozilla.telemetry.views.SyncView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"}),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

sync_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_summary",
        dataset_version="v2",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="sync_view_bigquery_load",
    dag=dag)

sync_events_view = EMRSparkOperator(
    task_id="sync_events_view",
    job_name="Sync Events View",
    execution_timeout=timedelta(hours=10),
    instance_count=1,
    email=['*****@*****.**'],
    env = tbv_envvar("com.mozilla.telemetry.views.SyncEventView", {
        "from": "{{ ds_nodash }}",