Code example #1
def repeated_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    additional_params=None,
    num_partitions=5,
    date_partition_parameter="submission_date",
):
    dag = DAG(
        "%s.%s" % (parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    # This task runs first and replaces the relevant partition; the remaining
    # tasks then append to the same partition of the same table.
    NUM_SAMPLE_IDS = 100
    PARTITION_SIZE = NUM_SAMPLE_IDS // num_partitions  # integer division; "/" would yield floats in Python 3
    task_0 = bigquery_etl_query(
        task_id="{dag_name}_0".format(dag_name=child_dag_name),
        destination_table="{dag_name}_v1".format(dag_name=child_dag_name),
        dataset_id=dataset_id,
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        depends_on_past=True,
        parameters=merge_params(0, PARTITION_SIZE - 1, additional_params),
        date_partition_parameter=date_partition_parameter,
        arguments=("--replace", ),
        dag=dag,
    )

    for partition in range(1, num_partitions):
        min_param = partition * PARTITION_SIZE
        max_param = min_param + PARTITION_SIZE - 1

        task = bigquery_etl_query(
            task_id="{}_{}".format(child_dag_name, partition),
            destination_table="{dag_name}_v1".format(dag_name=child_dag_name),
            dataset_id=dataset_id,
            project_id="moz-fx-data-shared-prod",
            owner="*****@*****.**",
            email=["*****@*****.**", "*****@*****.**"],
            depends_on_past=True,
            parameters=merge_params(min_param, max_param, additional_params),
            date_partition_parameter=date_partition_parameter,
            arguments=(
                "--append_table",
                "--noreplace",
            ),
            dag=dag,
        )
        task_0 >> task

    return dag
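The merge_params helper called above is not defined in this excerpt. Judging from example #2 below, which writes the equivalent parameters tuple inline, it presumably renders the min/max sample-id bounds in bigquery-etl's "name:TYPE:value" parameter syntax and appends any additional parameters. A minimal sketch under that assumption:

def merge_params(min_sample_id, max_sample_id, additional_params=None):
    # Render the sample-id bounds the way example #2 writes them inline,
    # then tack on any caller-supplied parameters.
    params = [
        "min_sample_id:INT64:{}".format(min_sample_id),
        "max_sample_id:INT64:{}".format(max_sample_id),
    ]
    if additional_params:
        params += list(additional_params)
    return params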
Code example #2
def repeated_subdag(parent_dag_name, child_dag_name, default_args,
                    schedule_interval, dataset_id):
    dag = DAG(
        "%s.%s" % (parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    NUM_PARTITIONS = 4
    NUM_SAMPLE_IDS = 100
    PARTITION_SIZE = NUM_SAMPLE_IDS // NUM_PARTITIONS  # integer division keeps the sample-id bounds integral
    task_0 = bigquery_etl_query(
        task_id="{dag_name}_0".format(dag_name=child_dag_name),
        destination_table="{dag_name}_v1".format(dag_name=child_dag_name),
        dataset_id=dataset_id,
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        depends_on_past=True,
        date_partition_parameter=None,
        parameters=(
            "min_sample_id:INT64:0",
            "max_sample_id:INT64:{}".format(PARTITION_SIZE - 1),
        ),
        arguments=("--replace", ),
        dag=dag,
    )

    for partition in range(1, NUM_PARTITIONS):
        min_param = partition * PARTITION_SIZE
        max_param = min_param + PARTITION_SIZE - 1

        task = bigquery_etl_query(
            task_id="{}_{}".format(child_dag_name, partition),
            destination_table="{dag_name}_v1".format(dag_name=child_dag_name),
            dataset_id=dataset_id,
            project_id="moz-fx-data-shared-prod",
            owner="*****@*****.**",
            email=["*****@*****.**", "*****@*****.**"],
            depends_on_past=True,
            date_partition_parameter=None,
            parameters=(
                "min_sample_id:INT64:{}".format(min_param),
                "max_sample_id:INT64:{}".format(max_param),
            ),
            arguments=(
                "--append_table",
                "--noreplace",
            ),
            dag=dag,
        )
        task_0 >> task

    return dag
Code example #3
File: extract.py  Project: willkg/telemetry-airflow
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_extract_firefox_{}_v1".format(channel)
    etl_query = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        sql_file_path="sql/moz-fx-data-shared-prod/{}/glam_client_probe_counts_extract_v1/query.sql".format(
            dataset_id
        ),
        parameters=("channel:STRING:{}".format(channel), ),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="aggs-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{bucket}/aggs-desktop-{channel}-*.csv".format(
        bucket=glam_bucket, channel=channel)
    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
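Examples #3 through #5 reference project_id, glam_bucket, and gcp_conn as module-level globals that the excerpts omit. A minimal sketch of the setup they appear to assume (Airflow 1.10 contrib import; the bucket name and connection id are placeholders, not values from the source):

from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook

project_id = "moz-fx-data-shared-prod"
glam_bucket = "example-glam-extract-bucket"  # placeholder bucket name
# The operators above only read the hook's connection id (gcp_conn.gcp_conn_id).
gcp_conn = GoogleCloudBaseHook("google_cloud_default")  # placeholder connection id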
Code example #4
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_client_probe_counts_{}_extract_v1".format(channel)
    glam_client_probe_counts_extract = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        dag=dag,
    )

    glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="extract-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/extract-desktop-{}-*.csv".format(
        glam_bucket, channel)
    glam_extract_to_csv = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    glam_client_probe_counts_extract >> glam_gcs_delete_old_extracts >> glam_extract_to_csv

    return dag
Code example #5
File: extract.py  Project: willkg/telemetry-airflow
def extract_user_counts(parent_dag_name, child_dag_name, default_args,
                        schedule_interval, dataset_id):

    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_user_counts_extract_v1"
    etl_query = bigquery_etl_query(
        task_id="glam_user_counts_extract",
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_count_extracts",
        bucket_name=glam_bucket,
        prefix="glam-extract-firefox-counts",
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/glam-extract-firefox-counts.csv".format(
        glam_bucket)
    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_user_counts_to_csv",
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
Code example #6
def histogram_aggregates_subdag(parent_dag_name, child_dag_name, default_args,
                                schedule_interval, dataset_id):
    GLAM_HISTOGRAM_AGGREGATES_SUBDAG = "%s.%s" % (parent_dag_name,
                                                  child_dag_name)
    default_args["depends_on_past"] = True
    dag = DAG(
        GLAM_HISTOGRAM_AGGREGATES_SUBDAG,
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    clients_histogram_aggregates_new = bigquery_etl_query(
        task_id="clients_histogram_aggregates_new",
        destination_table="clients_histogram_aggregates_new_v1",
        dataset_id=dataset_id,
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
        parameters=("submission_date:DATE:{{ds}}", ),
        arguments=("--replace", ),
        dag=dag,
    )

    clients_histogram_aggregates_final = SubDagOperator(
        subdag=repeated_subdag(
            GLAM_HISTOGRAM_AGGREGATES_SUBDAG,
            GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG,
            default_args,
            dag.schedule_interval,
            dataset_id,
        ),
        task_id=GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG,
        executor=get_default_executor(),
        dag=dag,
    )

    clients_histogram_aggregates_new >> clients_histogram_aggregates_final
    return dag
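Example #6 relies on GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG and get_default_executor being defined elsewhere in the module. A minimal sketch of the assumed definitions (the constant's value is a guess; the import is the Airflow 1.10 API):

# Airflow 1.10 exposes the configured executor here; passing it explicitly
# matters because SubDagOperator otherwise falls back to SequentialExecutor.
from airflow.executors import get_default_executor

# Child dag_name under which repeated_subdag (examples #1/#2) is mounted.
# The exact value is an assumption; the excerpt does not show it.
GLAM_HISTOGRAM_AGGREGATES_FINAL_SUBDAG = "clients_histogram_aggregates"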
Code example #7
    # This single task is responsible for sequentially running copy queries
    # over all the tables in _live datasets into _stable datasets except those
    # that are specifically used in another DAG.
    copy_deduplicate_all = bigquery_etl_copy_deduplicate(
        task_id="copy_deduplicate_all",
        target_project_id="moz-fx-data-shared-prod",
        # Any table listed here under except_tables _must_ have a corresponding
        # copy_deduplicate job in another DAG.
        except_tables=["telemetry_live.main_v4"])

    # Events.

    event_events = bigquery_etl_query(
        task_id="event_events",
        project_id="moz-fx-data-shared-prod",
        destination_table="event_events_v1",
        dataset_id="telemetry_derived",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION', ),
    )

    copy_deduplicate_all >> event_events

    # Experiment enrollment aggregates chain (depends on events)

    wait_for_main_events = ExternalTaskSensor(
        task_id="wait_for_main_events",
        external_dag_id="main_summary",
        external_task_id="bq_main_events",
        dag=dag)
Code example #8
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

with DAG("bqetl_addons",
         default_args=default_args,
         schedule_interval="0 1 * * *") as dag:

    telemetry_derived__addons_daily__v1 = bigquery_etl_query(
        task_id="telemetry_derived__addons_daily__v1",
        destination_table="addons_daily_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    telemetry_derived__addons__v2 = bigquery_etl_query(
        task_id="telemetry_derived__addons__v2",
        destination_table="addons_v2",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
Code example #9
    "email_on_failure":
    True,
    "email_on_retry":
    True,
    "retries":
    1,
}

with DAG(
        "bqetl_error_aggregates",
        default_args=default_args,
        schedule_interval=datetime.timedelta(seconds=10800),
        doc_md=docs,
) as dag:

    telemetry_derived__error_aggregates__v1 = bigquery_etl_query(
        task_id="telemetry_derived__error_aggregates__v1",
        destination_table="error_aggregates_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )
Code example #10
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
    "bqetl_search", default_args=default_args, schedule_interval="0 1 * * *"
) as dag:

    search_derived__search_metric_contribution__v1 = bigquery_etl_query(
        task_id="search_derived__search_metric_contribution__v1",
        destination_table="search_metric_contribution_v1",
        dataset_id="search_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    search_derived__search_aggregates__v8 = bigquery_etl_query(
        task_id="search_derived__search_aggregates__v8",
        destination_table="search_aggregates_v8",
        dataset_id="search_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
Code example #11
with DAG('experiments_live',
         default_args=default_args,
         concurrency=4,
         max_active_runs=1,
         schedule_interval="*/5 * * * *") as dag:

    docker_image = "mozilla/bigquery-etl:latest"

    experiment_enrollment_aggregates_recents = bigquery_etl_query(
        task_id="experiment_enrollment_aggregates_recents",
        destination_table="experiment_enrollment_aggregates_recents_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
        depends_on_past=True,
        parameters=["submission_timestamp:TIMESTAMP:{{ts}}"],
        dag=dag,
        is_delete_operator_pod=True,
    )

    experiment_search_aggregates_recents = bigquery_etl_query(
        task_id="experiment_search_aggregates_recents",
        destination_table="experiment_search_aggregates_recents_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
Code example #12
    "retries": 2,
}

with DAG(
        "bqetl_fenix_event_rollup",
        default_args=default_args,
        schedule_interval="0 2 * * *",
        doc_md=docs,
) as dag:

    fenix_derived__event_types__v1 = bigquery_etl_query(
        task_id="fenix_derived__event_types__v1",
        destination_table="event_types_v1",
        dataset_id="fenix_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
        depends_on_past=False,
        parameters=["submission_date:DATE:{{ds}}"],
        dag=dag,
    )

    fenix_derived__event_types_history__v1 = bigquery_etl_query(
        task_id="fenix_derived__event_types_history__v1",
        destination_table="event_types_history_v1",
        dataset_id="fenix_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=True,
Code example #13
    # If a task fails, retry it once after waiting at least 10 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=10),
}

dag_name = 'fxa_events'

with models.DAG(
        dag_name,
        # Continue to run DAG once per day
        schedule_interval='0 10 * * *',
        default_args=default_args) as dag:

    fxa_auth_events = bigquery_etl_query(
        task_id='fxa_auth_events',
        destination_table='fxa_auth_events_v1',
        dataset_id='telemetry',
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
    )

    fxa_auth_bounce_events = bigquery_etl_query(
        task_id='fxa_auth_bounce_events',
        destination_table='fxa_auth_bounce_events_v1',
        dataset_id='telemetry',
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
    )

    fxa_content_events = bigquery_etl_query(
        task_id='fxa_content_events',
        destination_table='fxa_content_events_v1',
        dataset_id='telemetry',
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
Code example #14
    "email": ["*****@*****.**", "*****@*****.**"],
    "depends_on_past": False,
    "retry_delay": datetime.timedelta(seconds=300),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 1,
}

with DAG("bqetl_core", default_args=default_args, schedule_interval="0 2 * * *") as dag:

    telemetry_derived__core_clients_daily__v1 = bigquery_etl_query(
        task_id="telemetry_derived__core_clients_daily__v1",
        destination_table="core_clients_daily_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        priority_weight=75,
        dag=dag,
    )

    telemetry_derived__core_clients_last_seen__v1 = bigquery_etl_query(
        task_id="telemetry_derived__core_clients_last_seen__v1",
        destination_table="core_clients_last_seen_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=True,
Code example #15
    "retries": 2,
}

with DAG("bqetl_main_summary",
         default_args=default_args,
         schedule_interval="0 2 * * *") as dag:

    firefox_desktop_exact_mau28_by_client_count_dimensions = bigquery_etl_query(
        task_id="firefox_desktop_exact_mau28_by_client_count_dimensions",
        destination_table="firefox_desktop_exact_mau28_by_client_count_dimensions_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    firefox_desktop_exact_mau28_by_dimensions = bigquery_etl_query(
        task_id="firefox_desktop_exact_mau28_by_dimensions",
        destination_table="firefox_desktop_exact_mau28_by_dimensions_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
Code example #16
default_args = {
    "owner": "*****@*****.**",
    "start_date": datetime.datetime(2020, 6, 29, 0, 0),
    "end_date": None,
    "email": ["*****@*****.**", "*****@*****.**"],
    "depends_on_past": False,
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
    "bqetl_deletion_request_volume",
    default_args=default_args,
    schedule_interval="0 1 * * *",
) as dag:

    monitoring_derived__deletion_request_volume__v1 = bigquery_etl_query(
        task_id="monitoring_derived__deletion_request_volume__v1",
        destination_table="deletion_request_volume_v1",
        dataset_id="monitoring_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )
Code example #17
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
    "bqetl_fenix_event_rollup", default_args=default_args, schedule_interval="0 2 * * *"
) as dag:

    org_mozilla_firefox_derived__event_types__v1 = bigquery_etl_query(
        task_id="org_mozilla_firefox_derived__event_types__v1",
        destination_table="event_types_v1",
        dataset_id="org_mozilla_firefox_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=True,
        dag=dag,
    )

    org_mozilla_firefox_derived__events_daily__v1 = bigquery_etl_query(
        task_id="org_mozilla_firefox_derived__events_daily__v1",
        destination_table="events_daily_v1",
        dataset_id="org_mozilla_firefox_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
Code example #18
    "retries": 1,
}

with DAG(
        "bqetl_fxa_events",
        default_args=default_args,
        schedule_interval="30 1 * * *",
        doc_md=docs,
) as dag:

    firefox_accounts_derived__exact_mau28__v1 = bigquery_etl_query(
        task_id="firefox_accounts_derived__exact_mau28__v1",
        destination_table="exact_mau28_v1",
        dataset_id="firefox_accounts_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    firefox_accounts_derived__fxa_auth_bounce_events__v1 = bigquery_etl_query(
        task_id="firefox_accounts_derived__fxa_auth_bounce_events__v1",
        destination_table="fxa_auth_bounce_events_v1",
        dataset_id="firefox_accounts_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
Code example #19
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
    "bqetl_asn_aggregates", default_args=default_args, schedule_interval="0 2 * * *"
) as dag:

    telemetry_derived__asn_aggregates__v1 = bigquery_etl_query(
        task_id="telemetry_derived__asn_aggregates__v1",
        destination_table="asn_aggregates_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        parameters=["n_clients:INT64:500"],
        dag=dag,
    )

    wait_for_bq_main_events = ExternalTaskSensor(
        task_id="wait_for_bq_main_events",
        external_dag_id="copy_deduplicate",
        external_task_id="bq_main_events",
        execution_delta=datetime.timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )
Code example #20
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
        "bqetl_google_analytics_derived",
        default_args=default_args,
        schedule_interval="0 23 * * *",
) as dag:

    ga_derived__blogs_daily_summary__v1 = bigquery_etl_query(
        task_id="ga_derived__blogs_daily_summary__v1",
        destination_table="blogs_daily_summary_v1",
        dataset_id="ga_derived",
        project_id="moz-fx-data-marketing-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    ga_derived__blogs_empty_check__v1 = bigquery_etl_query(
        task_id="ga_derived__blogs_empty_check__v1",
        destination_table=None,
        dataset_id="ga_derived",
        project_id="moz-fx-data-marketing-prod",
        owner="*****@*****.**",
        email=["*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
Code example #21
    "retries":
    1,
}

with DAG("bqetl_vrbrowser",
         default_args=default_args,
         schedule_interval="0 2 * * *") as dag:

    org_mozilla_vrbrowser_derived__baseline_daily__v1 = bigquery_etl_query(
        task_id="org_mozilla_vrbrowser_derived__baseline_daily__v1",
        destination_table="baseline_daily_v1",
        dataset_id="org_mozilla_vrbrowser_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    org_mozilla_vrbrowser_derived__metrics_daily__v1 = bigquery_etl_query(
        task_id="org_mozilla_vrbrowser_derived__metrics_daily__v1",
        destination_table="metrics_daily_v1",
        dataset_id="org_mozilla_vrbrowser_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=[
Code example #22
with models.DAG(
        dag_name,
        schedule_interval="0 2 * * *",
        default_args=default_args) as dag:

    wait_for_copy_deduplicate_main_ping = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_main_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_main_ping",
        execution_delta=datetime.timedelta(hours=1),
        dag=dag,
    )

    fission_monitoring_main_v1 = bigquery_etl_query(
        task_id="fission_monitoring_main_v1",
        project_id="moz-fx-data-shared-prod",
        destination_table="fission_monitoring_main_v1",
        dataset_id="telemetry_derived",
        arguments=('--schema_update_option=ALLOW_FIELD_ADDITION',),
    )

    wait_for_copy_deduplicate_crash_ping = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_crash_ping",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(hours=1),
        dag=dag,
    )

    fission_monitoring_crash_v1 = bigquery_etl_query(
        task_id="fission_monitoring_crash_v1",
        project_id="moz-fx-data-shared-prod",
Code example #23
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/monitoring_derived/column_size_v1/query.py",
        ]
        + ["--date", "{{ ds }}"],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
        email=["*****@*****.**"],
    )

    monitoring_derived__schema_error_counts__v2 = bigquery_etl_query(
        task_id="monitoring_derived__schema_error_counts__v2",
        destination_table="schema_error_counts_v2",
        dataset_id="monitoring_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    monitoring_derived__stable_table_sizes__v1 = gke_command(
        task_id="monitoring_derived__stable_table_sizes__v1",
        command=[
            "python",
            "sql/moz-fx-data-shared-prod/monitoring_derived/stable_table_sizes_v1/query.py",
        ]
        + ["--date", "{{ ds }}"],
        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
        owner="*****@*****.**",
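The gke_command helper used in example #23 is not shown beyond its call sites; it evidently runs a command inside a Docker image on a GKE pod. A hypothetical reduction to that observable behavior (the real helper in telemetry-airflow takes more options; everything below is inferred from the two calls above):

from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

def gke_command(task_id, command, docker_image, owner=None, email=None, **kwargs):
    # Run `command` inside `docker_image` on a Kubernetes pod. Mirrors only
    # the arguments visible in the calls above, not the real helper's API.
    return KubernetesPodOperator(
        task_id=task_id,
        name=task_id.replace("_", "-"),  # pod names must be DNS-safe
        namespace="default",             # placeholder namespace
        image=docker_image,
        arguments=command,
        owner=owner,
        email=email,
        **kwargs,
    )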
Code example #24
    "retries": 2,
}

with DAG(
    "bqetl_org_mozilla_fenix_derived",
    default_args=default_args,
    schedule_interval="0 2 * * *",
    doc_md=docs,
) as dag:

    org_mozilla_fenix_derived__geckoview_version__v1 = bigquery_etl_query(
        task_id="org_mozilla_fenix_derived__geckoview_version__v1",
        destination_table="geckoview_version_v1",
        dataset_id="org_mozilla_fenix_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    wait_for_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )
Code example #25
    "retry_delay": datetime.timedelta(seconds=300),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 1,
}

with DAG("bqetl_messaging_system",
         default_args=default_args,
         schedule_interval="0 2 * * *") as dag:

    messaging_system_derived__cfr_exact_mau28_by_dimensions__v1 = bigquery_etl_query(
        task_id="messaging_system_derived__cfr_exact_mau28_by_dimensions__v1",
        destination_table="cfr_exact_mau28_by_dimensions_v1",
        dataset_id="messaging_system_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    messaging_system_derived__cfr_users_daily__v1 = bigquery_etl_query(
        task_id="messaging_system_derived__cfr_users_daily__v1",
        destination_table="cfr_users_daily_v1",
        dataset_id="messaging_system_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
Code example #26
    "retries": 2,
}

with DAG(
        "bqetl_experiments_daily",
        default_args=default_args,
        schedule_interval="0 3 * * *",
        doc_md=docs,
) as dag:

    experiment_enrollment_daily_active_population = bigquery_etl_query(
        task_id="experiment_enrollment_daily_active_population",
        destination_table="experiment_enrollment_daily_active_population_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter=None,
        depends_on_past=False,
        dag=dag,
    )

    monitoring__query_cost__v1 = bigquery_etl_query(
        task_id="monitoring__query_cost__v1",
        destination_table="query_cost_v1",
        dataset_id="monitoring",
        project_id="moz-fx-data-experiments",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
Code example #27
    "retries": 2,
}

with DAG(
        "bqetl_internet_outages",
        default_args=default_args,
        schedule_interval="0 3 * * *",
        doc_md=docs,
) as dag:

    internet_outages__global_outages__v1 = bigquery_etl_query(
        task_id="internet_outages__global_outages__v1",
        destination_table="global_outages_v1",
        dataset_id="internet_outages",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    wait_for_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(seconds=7200),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )
Code example #28
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

with DAG(
    "bqetl_amo_stats", default_args=default_args, schedule_interval="0 3 * * *"
) as dag:

    amo_dev__amo_stats_dau__v2 = bigquery_etl_query(
        task_id="amo_dev__amo_stats_dau__v2",
        destination_table="amo_stats_dau_v2",
        dataset_id="amo_dev",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    amo_dev__amo_stats_installs__v3 = bigquery_etl_query(
        task_id="amo_dev__amo_stats_installs__v3",
        destination_table="amo_stats_installs_v3",
        dataset_id="amo_dev",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
Code example #29
with DAG(
        "bqetl_search",
        default_args=default_args,
        schedule_interval="0 3 * * *",
        doc_md=docs,
) as dag:

    search_derived__search_aggregates__v8 = bigquery_etl_query(
        task_id="search_derived__search_aggregates__v8",
        destination_table="search_aggregates_v8",
        dataset_id="search_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    search_derived__search_clients_daily__v8 = bigquery_etl_query(
        task_id="search_derived__search_clients_daily__v8",
        destination_table="search_clients_daily_v8",
        dataset_id="search_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
Code example #30
    "retry_delay": datetime.timedelta(seconds=300),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 1,
}

with DAG(
    "bqetl_gud", default_args=default_args, schedule_interval="0 3 * * *", doc_md=docs
) as dag:

    telemetry_derived__smoot_usage_desktop__v2 = bigquery_etl_query(
        task_id="telemetry_derived__smoot_usage_desktop__v2",
        destination_table="smoot_usage_desktop_v2",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
        dag=dag,
    )

    telemetry_derived__smoot_usage_desktop_compressed__v2 = bigquery_etl_query(
        task_id="telemetry_derived__smoot_usage_desktop_compressed__v2",
        destination_table="smoot_usage_desktop_compressed_v2",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="*****@*****.**",
        email=["*****@*****.**", "*****@*****.**"],
        date_partition_parameter="submission_date",
        depends_on_past=False,