Example #1
# Tail of a SubDagOperator definition; the opening of this excerpt is missing.
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)

bgbb_pred_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="bgbb_pred_bigquery_load",
        default_args=default_args,
        dataset="bgbb/active_profiles",
        dataset_version="v1",
        p2b_table_alias="active_profiles_v1",
        bigquery_dataset="telemetry_derived",
        ds_type="ds",
        gke_cluster_name="bq-load-gke-1",
        cluster_by=["sample_id"],
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
        env_vars={"GOOGLE_CLOUD_PROJECT":
                  "{{ var.value.gcp_shared_prod_project }}"},
    ),
    task_id="bgbb_pred_bigquery_load",
    dag=dag)
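
# The load_to_bigquery factory used above is internal to
# mozilla/telemetry-airflow; a minimal sketch of the shape such a subdag
# factory takes (everything beyond the naming contract is an assumption):
from airflow import DAG

def load_to_bigquery(parent_dag_name, dag_name, default_args, **kwargs):
    # SubDagOperator requires the child dag_id to be "<parent>.<task_id>",
    # which is why dag_name always matches the wrapping SubDagOperator's
    # task_id in these examples.
    subdag = DAG(
        dag_id="{}.{}".format(parent_dag_name, dag_name),
        default_args=default_args,
        schedule_interval=None,
    )
    # ... export and BigQuery-load tasks, parameterized by dataset,
    # dataset_version, cluster_by, rename, replace, etc., would be added
    # to `subdag` here ...
    return subdag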

search_clients_daily_bigquery = bigquery_etl_query(
    task_id="search_clients_daily_bigquery",
    destination_table="search_clients_daily_v8",
    dataset_id="search_derived",
    project_id="moz-fx-data-shared-prod",
    # ... remaining arguments truncated in the source excerpt ...
)

# Fragment from a separate per-dataset loop that builds one BigQuery load
# task per dataset (a reconstruction of the enclosing loop follows the
# fragment):
        task_name = dataset.replace('-', '_') + '_bigquery_load'

        kwargs = {
            'parent_dag_name': dag.dag_id,
            'dag_name': task_name,
            'default_args': default_args,
            'dataset_s3_bucket': 'net-mozaws-prod-us-west-2-pipeline-data',
            'aws_conn_id': 'aws_prod_iam_s3',
            'dataset': dataset,
            'gke_cluster_name': 'bq-load-gke-1',
            'bigquery_dataset': 'telemetry_derived',
        }

        kwargs.update(values)

        tasks[task_name] = SubDagOperator(subdag=load_to_bigquery(**kwargs),
                                          task_id=task_name)
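
# Hypothetical reconstruction of the loop enclosing the fragment above; the
# `datasets` mapping (dataset name -> per-dataset overrides) and its values
# are illustrative, not from the source.
datasets = {
    'telemetry_core_parquet': {'dataset_version': 'v3'},
}
tasks = {}
for dataset, values in datasets.items():
    task_name = dataset.replace('-', '_') + '_bigquery_load'
    kwargs = {
        'parent_dag_name': dag.dag_id,
        'dag_name': task_name,
        'dataset': dataset,
        # ... shared defaults as in the fragment above ...
    }
    kwargs.update(values)  # per-dataset overrides win over shared defaults
    tasks[task_name] = SubDagOperator(subdag=load_to_bigquery(**kwargs),
                                      task_id=task_name)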

    # Daily and last seen views on top of core pings.

    core_clients_daily = bigquery_etl_query(
        task_id='core_clients_daily',
        destination_table='core_clients_daily_v1',
        dataset_id='telemetry',
    )

    tasks['telemetry_core_parquet_bigquery_load'] >> core_clients_daily

    core_clients_last_seen = bigquery_etl_query(
        task_id='core_clients_last_seen',
        destination_table='core_clients_last_seen_raw_v1',
        # ... remaining arguments truncated in the source excerpt ...
    )

# Tail of the `crash_summary_view` job definition (its opening lines are
# missing from this excerpt):
    ebs_volume_size=250,
    execution_timeout=timedelta(hours=4),
    env=tbv_envvar(
        "com.mozilla.telemetry.views.CrashSummaryView", {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "outputBucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

crash_summary_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="crash_summary_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        ds_type="ds",
        aws_conn_id="aws_dev_iam_s3",
        dataset="crash_summary",
        dataset_version="v2",
        date_submission_col="submission_date",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
    ),
    task_id="crash_summary_view_bigquery_load",
    dag=dag)

crash_summary_view >> crash_summary_view_bigquery_load
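
# Hedged sketch of tbv_envvar, the env builder used by the
# telemetry-batch-view jobs above: it records which Scala view class to run
# and flattens the option dict into environment variables consumed by
# telemetry_batch_view.py. The exact variable naming is an assumption.
def tbv_envvar(klass, options):
    env = {"TBV_CLASS": klass}
    for key, value in options.items():
        # e.g. "from" -> "TBV_FROM", "read-mode" -> "TBV_READ_MODE" (assumed)
        env["TBV_" + key.upper().replace("-", "_")] = value
    return env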
register_status(main_summary, "Main Summary", "A summary view of main pings.")

main_summary_schema = EmailSchemaChangeOperator(
    task_id="main_summary_schema",
    email=["*****@*****.**", "*****@*****.**"],
    to=["*****@*****.**", "*****@*****.**"],
    key_prefix='schema/main_summary/submission_date_s3=',
    dag=dag)

main_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="main_summary_bigquery_load",
    dag=dag)

engagement_ratio = EMRSparkOperator(
    task_id="engagement_ratio",
    job_name="Update Engagement Ratio",
    execution_timeout=timedelta(hours=6),
    instance_count=10,
    env=mozetl_envvar("engagement_ratio",
                      options={
                          "input_bucket": "{{ task.__class__.private_output_bucket }}",
                          # ... remaining options truncated in the source excerpt ...
                      }),
    # ... remaining arguments truncated in the source excerpt ...
)

# Tail of the `crash_report_parquet` job definition (its opening lines are
# missing from this excerpt):
    env={"date": "{{ ds_nodash }}"},
    uri="https://raw.githubusercontent.com/mozilla-services/data-pipeline/master/reports/socorro_import/ImportCrashData.ipynb",
    output_visibility="public",
    dag=dag,
)

register_status(
    crash_report_parquet,
    crash_report_parquet.job_name,
    "Convert processed crash reports into parquet for analysis",
)
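
# Hedged sketch of a register_status-style helper: the real one in
# telemetry-airflow publishes task metadata to an external pipeline-status
# dashboard; this stand-in only records the metadata on the task, and the
# `status_metadata` attribute is invented for illustration.
def register_status(task, name, description):
    task.status_metadata = {"name": name, "description": description}
    return task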

crash_report_parquet_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="socorro_crash",
        dataset_version="v2",
        date_submission_col="crash_date",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_raw",
    ),
    task_id="crash_report_parquet_bigquery_load",
    dag=dag)

crash_report_parquet >> crash_report_parquet_bigquery_load
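
# Airflow overloads the bitshift operator for dependencies: `a >> b` is
# shorthand for a.set_downstream(b), so the BigQuery load above only runs
# once the parquet conversion has succeeded.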
        job_name="main_summary_view_{{ds_nodash}}",
        init_actions_uris=[],
        gcp_conn_id="google_cloud_airflow_dataproc",
    ),
    task_id="main_summary_dataproc",
    dag=dag,
)

main_summary_dataproc_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="main_summary_dataproc_bigquery_load",
        default_args=default_args,
        dataset_gcs_bucket=main_summary_dataproc_bucket.replace("gs://", ""),
        dataset="main_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="backfill",
        cluster_by=["sample_id"],
        drop=["submission_date"],
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="main_summary_dataproc_bigquery_load",
    dag=dag,
)
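
# The drop/rename/replace/cluster_by arguments suggest the loader rewrites
# the exported columns before writing to BigQuery. A hypothetical sketch of
# how such kwargs could be turned into a SELECT list (the real construction
# is internal to load_to_bigquery):
def build_select(columns, drop=(), rename=None, replace=()):
    rename = rename or {}
    replaced = [expr.rsplit(" AS ", 1)[-1] for expr in replace]
    select = []
    for col in columns:
        if col in drop or col in replaced:
            continue  # dropped outright, or superseded by a replace expression
        alias = rename.get(col)
        select.append("{} AS {}".format(col, alias) if alias else col)
    return list(replace) + select

# build_select(["submission_date", "submission_date_s3", "sample_id"],
#              drop=["submission_date"],
#              rename={"submission_date_s3": "submission_date"},
#              replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"])
# -> ['SAFE_CAST(sample_id AS INT64) AS sample_id',
#     'submission_date_s3 AS submission_date']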

main_ping_bigquery_export_delete = gke_command(
    task_id="main_ping_bigquery_export_delete",
    command=[
        "gsutil",
        "-m",
        # ... remaining command arguments truncated in the source excerpt ...
    ],
    # ... remaining arguments truncated in the source excerpt ...
)
Example #8
# Tail of a SubDagOperator definition; the opening of this excerpt is missing.
        ],
        gcp_conn_id=params.conn_id,
        service_account=params.client_email,
        artifact_bucket=params.artifact_bucket,
        storage_bucket=params.storage_bucket,
        default_args=subdag_args,
    ),
)

bgbb_pred_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="bgbb_pred_bigquery_load",
        default_args=default_args,
        dataset="bgbb/active_profiles",
        dataset_version="v1",
        p2b_table_alias="active_profiles_v1",
        bigquery_dataset="telemetry_derived",
        ds_type="ds",
        gke_cluster_name="bq-load-gke-1",
        cluster_by=["sample_id"],
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="bgbb_pred_bigquery_load",
    dag=dag)

search_clients_daily_bigquery = bigquery_etl_query(
    task_id="search_clients_daily_bigquery",
    destination_table="search_clients_daily_v8",
    dataset_id="search_derived",
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
    # ... remaining arguments truncated in the source excerpt ...
)
Example #9
# Tail of the churn job definition (its opening lines are missing from this
# excerpt):
    instance_count=10,
    env=mozetl_envvar("churn", {
        "start_date": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"
    }),
    uri="https://raw.githubusercontent.com/mozilla/python_mozetl/master/bin/mozetl-submit.sh",
    output_visibility="public",
    dag=dag)
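
# Hedged sketch of mozetl_envvar, the python_mozetl counterpart of
# tbv_envvar: it packages a mozetl subcommand and its options into
# environment variables for mozetl-submit.sh. The exact variable names are
# assumptions.
def mozetl_envvar(command, options):
    env = {"MOZETL_COMMAND": command}
    for key, value in options.items():
        env["MOZETL_{}_{}".format(command.upper(), key.upper())] = value
    return env

# e.g. mozetl_envvar("churn", {"start_date": "{{ ds_nodash }}"}) would give
# {"MOZETL_COMMAND": "churn", "MOZETL_CHURN_START_DATE": "{{ ds_nodash }}"}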

churn_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="churn_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="churn",
        dataset_version="v3",
        date_submission_col="week_start",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="churn_bigquery_load",
    dag=dag)

churn_v2 = MozDatabricksSubmitRunOperator(
    task_id="churn_v2",
    job_name="churn 7-day v2",
    execution_timeout=timedelta(hours=4),
    instance_count=5,
    env=mozetl_envvar("churn", {
        "start_date": "{{ ds_nodash }}",
        # ... remaining options truncated in the source excerpt ...
    }),
    # ... remaining arguments truncated in the source excerpt ...
)
Example #10
# Tail of the `sync_view` job definition (its opening lines are missing from
# this excerpt):
    env=tbv_envvar(
        "com.mozilla.telemetry.views.SyncView", {
            "from": "{{ ds_nodash }}",
            "to": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}"
        }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

sync_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_summary",
        dataset_version="v2",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
    ),
    task_id="sync_view_bigquery_load",
    dag=dag)

sync_events_view = EMRSparkOperator(
    task_id="sync_events_view",
    job_name="Sync Events View",
    execution_timeout=timedelta(hours=10),
    instance_count=1,
    email=['*****@*****.**'],
    env=tbv_envvar(
        "com.mozilla.telemetry.views.SyncEventView",
        # ... options truncated in the source excerpt; Example #13 shows the
        # same call with "from"/"to"/"bucket" options ...
    ),
    # ... remaining arguments truncated in the source excerpt ...
)

# Tail of the `first_shutdown_summary` job definition (its opening lines are
# missing from this excerpt):
            "to": "{{ ds_nodash }}",
            "bucket": "{{ task.__class__.private_output_bucket }}",
            "doc-type": "first_shutdown",
            "read-mode": "aligned",
            "input-partition-multiplier": "4"
        }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

first_shutdown_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="first_shutdown_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="first_shutdown_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        bigquery_dataset="telemetry_derived",
        cluster_by=["sample_id"],
        drop=["submission_date"],
        rename={"submission_date_s3": "submission_date"},
        replace=["SAFE_CAST(sample_id AS INT64) AS sample_id"],
    ),
    task_id="first_shutdown_summary_bigquery_load",
    dag=dag)

first_shutdown_summary >> first_shutdown_summary_bigquery_load
Example #13
    job_name="Sync Pings View",
    execution_timeout=timedelta(hours=10),
    instance_count=5,
    env = tbv_envvar("com.mozilla.telemetry.views.SyncView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}"}),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

sync_view_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="sync_view_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="sync_summary",
        dataset_version="v2",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="sync_view_bigquery_load",
    dag=dag)

sync_events_view = EMRSparkOperator(
    task_id="sync_events_view",
    job_name="Sync Events View",
    execution_timeout=timedelta(hours=10),
    instance_count=1,
    email=['*****@*****.**'],
    env=tbv_envvar("com.mozilla.telemetry.views.SyncEventView", {
        "from": "{{ ds_nodash }}",
        # ... remaining options truncated in the source excerpt ...
    }),
    # ... remaining arguments truncated in the source excerpt ...
)

# Opening line below reconstructed from context; `first_shutdown_summary` is
# referenced by the dependency at the end of this example:
first_shutdown_summary = EMRSparkOperator(
    task_id="first_shutdown_summary",
    job_name="First Shutdown Summary View",
    execution_timeout=timedelta(hours=1),
    instance_count=1,
    env=tbv_envvar("com.mozilla.telemetry.views.MainSummaryView", {
        "from": "{{ ds_nodash }}",
        "to": "{{ ds_nodash }}",
        "bucket": "{{ task.__class__.private_output_bucket }}",
        "doc-type": "first_shutdown",
        "read-mode": "aligned",
        "input-partition-multiplier": "4"
    }),
    uri="https://raw.githubusercontent.com/mozilla/telemetry-airflow/master/jobs/telemetry_batch_view.py",
    dag=dag)

first_shutdown_summary_bigquery_load = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        dag_name="first_shutdown_summary_bigquery_load",
        default_args=default_args,
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="first_shutdown_summary",
        dataset_version="v4",
        gke_cluster_name="bq-load-gke-1",
        ),
    task_id="first_shutdown_summary_bigquery_load",
    dag=dag)

first_shutdown_summary >> first_shutdown_summary_bigquery_load
# Fragment from a per-dataset loop as in Example #1, here loading into the
# `telemetry_raw` dataset rather than `telemetry_derived`:
        kwargs = {
            'parent_dag_name': dag.dag_id,
            'dag_name': task_name,
            'default_args': default_args,
            'dataset_s3_bucket': 'net-mozaws-prod-us-west-2-pipeline-data',
            'aws_conn_id': 'aws_prod_iam_s3',
            'dataset': dataset,
            'gke_cluster_name': 'bq-load-gke-1',
            'bigquery_dataset': 'telemetry_raw',
        }

        kwargs.update(values)

        tasks[task_name] = SubDagOperator(
            subdag=load_to_bigquery(**kwargs),
            task_id=task_name)


    # Daily and last seen views on top of core pings.

    core_clients_daily = bigquery_etl_query(
        task_id='core_clients_daily',
        destination_table='core_clients_daily_v1',
    )

    tasks['telemetry_core_parquet_bigquery_load'] >> core_clients_daily

    core_clients_last_seen = bigquery_etl_query(
        task_id='core_clients_last_seen',
        destination_table='core_clients_last_seen_raw_v1',
        # ... remaining arguments truncated in the source excerpt ...
    )