Code example #1
    def test_delete_objects(self, mock_hook):
        operator = GoogleCloudStorageDeleteOperator(task_id=TASK_ID,
                                                    bucket_name=TEST_BUCKET,
                                                    objects=MOCK_FILES[0:2])

        operator.execute(None)
        mock_hook.return_value.list.assert_not_called()
        mock_hook.return_value.delete.assert_has_calls(
            calls=[
                mock.call(bucket_name=TEST_BUCKET, object_name=MOCK_FILES[0]),
                mock.call(bucket_name=TEST_BUCKET, object_name=MOCK_FILES[1]),
            ],
            any_order=True,
        )
Code example #2
    def test_delete_prefix(self, mock_hook):
        mock_hook.return_value.list.return_value = MOCK_FILES[1:3]
        operator = GoogleCloudStorageDeleteOperator(task_id=TASK_ID,
                                                    bucket_name=TEST_BUCKET,
                                                    prefix=PREFIX)

        operator.execute(None)
        mock_hook.return_value.list.assert_called_once_with(
            bucket_name=TEST_BUCKET, prefix=PREFIX)
        mock_hook.return_value.delete.assert_has_calls(
            calls=[
                mock.call(bucket_name=TEST_BUCKET, object_name=MOCK_FILES[1]),
                mock.call(bucket_name=TEST_BUCKET, object_name=MOCK_FILES[2]),
            ],
            any_order=True,
        )
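The two test methods above assume a surrounding test class in which the operator's hook is patched and a few module-level fixtures exist. A minimal sketch of that scaffolding follows; the import path, patch target, class name, and constant values are assumptions for illustration, not the original test module.

import unittest
from unittest import mock

# Import path assumed from the Airflow 1.10 contrib layout.
from airflow.contrib.operators.gcs_delete_operator import GoogleCloudStorageDeleteOperator

TASK_ID = "gcs-delete-task"
TEST_BUCKET = "test-bucket"
PREFIX = "folder/"
MOCK_FILES = ["folder/a.csv", "folder/b.csv", "folder/c.csv"]


# Patching the hook where the operator imports it injects the mock into each
# test method as mock_hook (the patch target is an assumption).
@mock.patch("airflow.contrib.operators.gcs_delete_operator.GoogleCloudStorageHook")
class TestGoogleCloudStorageDeleteOperator(unittest.TestCase):
    # test_delete_objects and test_delete_prefix from the snippets above go here
    pass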
Code example #3
File: extract.py Project: willkg/telemetry-airflow
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_extract_firefox_{}_v1".format(channel)
    etl_query = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        sql_file_path=(
            "sql/moz-fx-data-shared-prod/{}/glam_client_probe_counts_extract_v1/query.sql"
            .format(dataset_id)
        ),
        parameters=("channel:STRING:{}".format(channel), ),
        dag=dag,
    )

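    # Clear CSVs left by a previous run so the export below writes a clean set.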
    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="aggs-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{bucket}/aggs-desktop-{channel}-*.csv".format(
        bucket=glam_bucket, channel=channel)
    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
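Since extract_channel_subdag returns a DAG, a parent DAG typically mounts it with a SubDagOperator whose task_id matches the child_dag_name. A rough sketch under assumed names (the parent DAG "glam", its default_args, and the channel value are illustrative, not taken from the source):

from airflow.operators.subdag_operator import SubDagOperator  # Airflow 1.10 path

extract_nightly = SubDagOperator(
    task_id="extract_nightly",
    subdag=extract_channel_subdag(
        parent_dag_name="glam",            # must equal the parent DAG's dag_id
        child_dag_name="extract_nightly",  # must equal this task_id
        default_args=default_args,
        schedule_interval=dag.schedule_interval,  # keep in sync with the parent
        dataset_id="telemetry",
        channel="nightly",
    ),
    dag=dag,
)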
Code example #4
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_client_probe_counts_{}_extract_v1".format(channel)
    glam_client_probe_counts_extract = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        dag=dag,
    )

    glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="extract-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/extract-desktop-{}-*.csv".format(
        glam_bucket, channel)
    glam_extract_to_csv = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    glam_client_probe_counts_extract >> glam_gcs_delete_old_extracts >> glam_extract_to_csv

    return dag
Code example #5
File: extract.py Project: willkg/telemetry-airflow
def extract_user_counts(parent_dag_name, child_dag_name, default_args,
                        schedule_interval, dataset_id):

    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_user_counts_extract_v1"
    etl_query = bigquery_etl_query(
        task_id="glam_user_counts_extract",
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_count_extracts",
        bucket_name=glam_bucket,
        prefix="glam-extract-firefox-counts",
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/glam-extract-firefox-counts.csv".format(
        glam_bucket)
    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_user_counts_to_csv",
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
Code example #6
        task_id="export_main_avro",
        cmds=["bash"],
        command=[
            "bin/export-avro.sh",
            "moz-fx-data-shared-prod",
            "moz-fx-data-shared-prod:analysis",
            "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/prerelease",
            "main_v4",
            "'nightly', 'beta'",
            "{{ ds }}",
        ],
        docker_image="mozilla/python_mozaggregator:latest",
        dag=dag,
    ).set_downstream(prerelease_telemetry_aggregate_view_dataproc)

    # Delete the GCS data
    GoogleCloudStorageDeleteOperator(
        task_id="delete_main_avro",
        bucket_name="moz-fx-data-derived-datasets-parquet-tmp",
        prefix=
        "avro/mozaggregator/prerelease/moz-fx-data-shared-prod/{{ ds_nodash }}/main_v4",
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    ).set_upstream(prerelease_telemetry_aggregate_view_dataproc)

# copy over artifacts if we're running in dev
if is_dev:
    copy_to_dev = copy_artifacts_dev(dag, project_id, artifact_bucket,
                                     storage_bucket)
    copy_to_dev.set_downstream(prerelease_telemetry_aggregate_view_dataproc)
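The set_downstream / set_upstream calls above run the Avro export before the Dataproc aggregation and the GCS cleanup after it. If the two anonymous tasks were bound to variables (the names below are assumptions), the equivalent bitshift wiring would be:

export_main_avro >> prerelease_telemetry_aggregate_view_dataproc >> delete_main_avro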
Code example #7
)

export_csv = gke_command(
    task_id="export_csv",
    cmds=["bash"],
    env_vars={"DATASET": "glam_etl"},
    command=["script/glam/export_csv"],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)

gcs_delete = GoogleCloudStorageDeleteOperator(
    task_id="gcs_delete",
    bucket_name=glam_bucket,
    prefix="glam-extract-fenix",
    gcp_conn_id="google_cloud_airflow_dataproc",
    dag=dag,
)

gcs_copy = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="gcs_copy",
    source_bucket="glam-fenix-dev",
    source_object="*.csv",
    destination_bucket=glam_bucket,
    gcp_conn_id="google_cloud_airflow_dataproc",
    dag=dag,
)

wait_for_copy_deduplicate >> run_sql >> export_csv >> gcs_delete >> gcs_copy
Code example #8
File: example_gcs.py Project: wangccia/airflow
        object_name=BUCKET_FILE_LOCATION,
        entity=GCS_ACL_ENTITY,
        role=GCS_ACL_OBJECT_ROLE,
        task_id="gcs_object_create_acl_entry_task",
    )
    # [END howto_operator_gcs_object_create_acl_entry_task]

    download_file = GoogleCloudStorageDownloadOperator(
        task_id="download_file",
        object_name=BUCKET_FILE_LOCATION,
        bucket=BUCKET_1,
        filename=PATH_TO_SAVED_FILE,
    )

    copy_file = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id="copy_file",
        source_bucket=BUCKET_1,
        source_object=BUCKET_FILE_LOCATION,
        destination_bucket=BUCKET_2,
        destination_object=BUCKET_FILE_LOCATION,
    )

    delete_files = GoogleCloudStorageDeleteOperator(task_id="delete_files",
                                                    bucket_name=BUCKET_1,
                                                    prefix="")

    [create_bucket1, create_bucket2] >> list_buckets >> list_buckets_result
    [create_bucket1, create_bucket2] >> upload_file
    upload_file >> [download_file, copy_file]
    upload_file >> gcs_bucket_create_acl_entry_task >> gcs_object_create_acl_entry_task >> delete_files
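The snippet references several module-level constants defined earlier in example_gcs.py. A plausible reconstruction is shown below; the environment-variable names and default values are assumptions and may differ from the real example file.

import os

BUCKET_1 = os.environ.get("GCP_GCS_BUCKET_1", "test-gcs-example-bucket-1")
BUCKET_2 = os.environ.get("GCP_GCS_BUCKET_2", "test-gcs-example-bucket-2")
BUCKET_FILE_LOCATION = "example-upload.txt"       # object name inside BUCKET_1
PATH_TO_SAVED_FILE = "/tmp/example-download.txt"  # local path used by download_file
GCS_ACL_ENTITY = "allUsers"                       # any valid ACL entity
GCS_ACL_OBJECT_ROLE = "READER"                    # any valid object ACL role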
Code example #9
File: gcp.py Project: ncloudioj/telemetry-airflow
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
):

    """ Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table:                             [Required] BigQuery table name
    :param Optional[str] destination_table:       Output table name, defaults to table,
                                                  will have r'_v[0-9]+$' replaced with
                                                  r'/v[0-9]+'
    :param List[str] arguments:                   Additional pyspark arguments
    :param bool use_storage_api:                  Whether to read from the BigQuery
                                                  Storage API or an AVRO export
    :param str dag_name:                          Name of DAG
    :param Optional[str] parent_dag_name:         Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id:                       Airflow connection id for GCP access
    :param str dataproc_storage_bucket:           Dataproc staging GCS bucket
    :param str dataproc_zone:                     GCP zone to launch dataproc clusters
    :param int num_preemptible_workers:           Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """

    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")
    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 35 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            + static_partitions
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                destination_cloud_storage_uris=avro_path,
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                google_cloud_storage_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

        return dag
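To make the name and path handling above concrete, here is roughly what it derives for an illustrative partitioned input (values assumed, not from the source):

# Assuming table = "telemetry_derived.my_table_v2$20200101" and the default buckets:
#   table_id          -> "my_table_v2$20200101"
#   unqualified_table -> "my_table_v2", partition_id -> "20200101"
#   cluster_name      -> "my-table-v2-export-{{ ds_nodash }}"  (11 chars, no truncation needed)
#   destination_table -> "my_table_v2"
#   export_prefix     -> "my_table/v2/"
#   avro_prefix       -> "avro/my_table/v2/partition_id=20200101/"
#   avro_path         -> "gs://moz-fx-data-derived-datasets-parquet/avro/my_table/v2/partition_id=20200101/*.avro"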
Code example #10
def export_to_amplitude(
        parent_dag_name,
        dag_name,
        default_args,
        project,
        dataset,
        table_or_view,
        s3_prefix,
        gcs_bucket='moz-fx-data-derived-datasets-amplitude-export',
        gcp_conn_id='google_cloud_derived_datasets',
        amplitude_s3_conn='amplitude_s3_conn',
        amplitude_s3_bucket='com-amplitude-vacuum-mozilla-vacuum-wup'):
    """Export a bigquery table or view to Amplitude.

    This uses the BigQueryToCloudStorage operator to export the
    partition to GCS, then pushes that data to S3. It operates
    on a temporary table that is dropped after the job is finished.

    :param str parent_dag_name: Parent dag name
    :param str dag_name: This dag's name (appended to parent_dag_name)
    :param dict default_args: DAG configuration
    :param str project: BigQuery project containing the table to be exported
    :param str dataset: BigQuery dataset
    :param str table_or_view: Table or view name
    :param str gcs_bucket: The bucket the data will be exported to
    :param str gcp_conn_id: GCP connection ID
    :param str amplitude_s3_conn: S3 connection ID
    :param str amplitude_s3_bucket: The bucket to export data to
    :param str s3_prefix: The prefix for the s3 objects
    """

    environment = environ['DEPLOY_ENVIRONMENT']
    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        # For now, we assume the view is already updated
        # See https://github.com/mozilla/bigquery-etl/issues/218

        exec_date = '{{ ds }}'

        # Check that we have data for this date
        check_sql = ('SELECT COUNT(*) '
                     'FROM `{}.{}.{}` '
                     'WHERE DATE(submission_timestamp) = "{}"').format(
                         project, dataset, table_or_view, exec_date)

        wait_for_data = BigQuerySQLSensorOperator(task_id='wait_for_data',
                                                  sql=check_sql,
                                                  bigquery_conn_id=gcp_conn_id,
                                                  use_legacy_sql=False)

        # Create the table with yesterday's data
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id
        temp_table_name = table_or_view + '_{{ ds_nodash }}'
        fully_qualified_table_name = '{}.{}.{}'.format(project_id, dataset,
                                                       temp_table_name)

        sql = ('SELECT * EXCEPT (submission_timestamp) '
               'FROM `{}.{}.{}` '
               'WHERE DATE(submission_timestamp) = "{}"').format(
                   project, dataset, table_or_view, exec_date)

        create_table = BigQueryOperator(
            task_id='create_temporary_table',
            sql=sql,
            destination_dataset_table=fully_qualified_table_name,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False)

        directory = '/'.join((environment, s3_prefix, '{{ ds_nodash }}'))
        extension = '.tsv.gz'

        # Export from bq to gcs
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
        gcs_uri = 'gs://{}/{}/*{}'.format(gcs_bucket, directory, extension)
        table_extract = BigQueryToCloudStorageOperator(
            task_id='bq_to_gcs',
            source_project_dataset_table=fully_qualified_table_name,
            destination_cloud_storage_uris=[gcs_uri],
            bigquery_conn_id=gcp_conn_id,
            compression='GZIP',
            export_format='CSV',
            field_delimiter='\t',
            print_header=True)

        # Push the data to S3
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29 # noqa: E501
        s3_push = GoogleCloudStorageToS3Operator(
            task_id='gcs_to_s3',
            bucket=gcs_bucket,
            prefix=directory,
            delimiter=extension,
            google_cloud_storage_conn_id=gcp_conn_id,
            dest_aws_conn_id=amplitude_s3_conn,
            dest_s3_key='s3://{}/'.format(amplitude_s3_bucket),
            replace=True)

        # Drop the temporary table
        table_drop = BigQueryOperator(
            task_id='drop_temp_table',
            sql='DROP TABLE `{}`'.format(fully_qualified_table_name),
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False)

        # Delete the GCS data
        data_delete = GoogleCloudStorageDeleteOperator(
            task_id='delete_gcs_data',
            bucket_name=gcs_bucket,
            prefix=directory,
            google_cloud_storage_conn_id=gcp_conn_id)

        wait_for_data >> create_table >> table_extract >> s3_push
        s3_push >> table_drop
        s3_push >> data_delete

        return dag
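As a quick check of the path construction above, with illustrative inputs (DEPLOY_ENVIRONMENT "prod", s3_prefix "events", table_or_view "my_view", execution date 2020-01-01) the templated values render roughly as:

# temp_table_name -> "my_view_20200101"
# directory       -> "prod/events/20200101"
# gcs_uri         -> "gs://moz-fx-data-derived-datasets-amplitude-export/prod/events/20200101/*.tsv.gz"
# dest_s3_key     -> "s3://com-amplitude-vacuum-mozilla-vacuum-wup/"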
Code example #11
def subdag_currency_exchange_to_bigquery(parent_dag_name, child_dag_name,
                                         execution_date, flow_name,
                                         raw_data_filepath,
                                         destination_project_dataset_table,
                                         schema_fields, bigquery_table_path,
                                         final_bigquery_table, args):
    """
    Subdag which does the following:
    - Parse the raw CSV file to get the dimension currency data & write
    results to a CSV
    - Upload the CSV to GCS
    - Copy data from GCS to BQ
    - Delete file from GCS

    :param parent_dag_name: Main DAG name
    :param child_dag_name: Child DAG name
    :param execution_date: (str) - Airflow execution date
    :param flow_name: (str) - Type of flow to execute:
     - dimension_currency
     - exchange_rate_history
    :param raw_data_filepath: (str) - Raw CSV filepath on Local
    :param destination_project_dataset_table: (str) - BQ table name,
     - dataset.table
    :param schema_fields: (list) - BQ table schema
    :param bigquery_table_path: (str) - BQ table query path
    :param final_bigquery_table: (str) - BQ table name
    :param args: Airflow arguments
    :return: None

    Note:
        Modified date: 10-04-2021
        Author: TB
    """
    dag = DAG(
        f"{parent_dag_name}.{child_dag_name}",
        default_args=args,
        schedule_interval="@daily",
    )

    # create filename
    filename = f"{flow_name}_{execution_date}.csv"

    # 1. extract data from raw csv file & upload to GCS
    clean_data_to_gcs = FlowToGoogleCloudStorage(
        task_id="clean_data_to_gcs",
        flow_name=flow_name,
        raw_data_filepath=raw_data_filepath,
        clean_filepath=f"downloads/{filename}",
        google_cloud_storage_conn_id="airflow_gcp_connection",
        gcs_bucket="airflow_poc",
        gcs_filepath=f"{flow_name}.csv",
        dag=dag)

    # 2. copy file from gcs to bigquery
    gcs_to_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="gcs_to_bq",
        bucket="airflow_poc",
        source_objects=[f"{flow_name}.csv"],
        destination_project_dataset_table=destination_project_dataset_table,
        schema_fields=schema_fields,
        write_disposition="WRITE_TRUNCATE",
        google_cloud_storage_conn_id="airflow_gcp_connection",
        bigquery_conn_id="airflow_gcp_connection",
        dag=dag)

    # 3. delete file from GCS
    delete_gcs_file = GoogleCloudStorageDeleteOperator(
        task_id="delete_gcs_file",
        bucket_name="airflow_poc",
        objects=[f"{flow_name}.csv"],
        google_cloud_storage_conn_id="airflow_gcp_connection",
        dag=dag)

    clean_data_to_gcs >> gcs_to_bq >> delete_gcs_file

    if bigquery_table_path and final_bigquery_table:
        with open(bigquery_table_path, "r") as q:
            data_query = q.read()

        # create table in bigquery using SQL query
        create_bigquery_table = BigQueryOperator(
            task_id=f"create_bigquery_table",
            sql=data_query,
            destination_dataset_table=final_bigquery_table,
            write_disposition="WRITE_TRUNCATE",
            bigquery_conn_id="airflow_gcp_connection",
            use_legacy_sql=False,
            create_disposition='CREATE_IF_NEEDED',
            time_partitioning=None,
            cluster_fields=None,
            location="EU",
            dag=dag)

        gcs_to_bq >> create_bigquery_table

    return dag
Code example #12
            "bin/export-avro.sh",
            "moz-fx-data-shared-prod",
            "moz-fx-data-shared-prod:analysis",
            "gs://moz-fx-data-derived-datasets-parquet-tmp/avro/mozaggregator/mobile",
            "saved_session_v4",
            "'nightly', 'beta'",
            "{{ ds }}",
        ],
        docker_image="mozilla/python_mozaggregator:latest",
        dag=dag,
    ).set_downstream(mobile_aggregate_view_dataproc)

    GoogleCloudStorageDeleteOperator(
        task_id="delete_mobile_metrics_avro",
        bucket_name="moz-fx-data-derived-datasets-parquet-tmp",
        prefix="avro/mozaggregator/mobile/moz-fx-data-shared-prod/{{ ds_nodash }}/mobile_metrics_v1",
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag
    ).set_upstream(mobile_aggregate_view_dataproc)

    GoogleCloudStorageDeleteOperator(
        task_id="delete_saved_session_avro",
        bucket_name="moz-fx-data-derived-datasets-parquet-tmp",
        prefix="avro/mozaggregator/mobile/moz-fx-data-shared-prod/{{ ds_nodash }}/saved_session_v4",
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag
    ).set_upstream(mobile_aggregate_view_dataproc)

register_status(
    mobile_aggregate_view_dataproc,
    "Mobile Aggregates",
Code example #13
    task_id="glam_client_probe_counts_extract",
    destination_table="glam_client_probe_counts_extract_v1",
    dataset_id=dataset_id,
    project_id="moz-fx-data-shared-prod",
    owner="*****@*****.**",
    email=[
        "*****@*****.**", "*****@*****.**",
        "*****@*****.**"
    ],
    date_partition_parameter=None,
    arguments=('--replace', ),
    dag=dag)

glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator(
    task_id="glam_gcs_delete_old_extracts",
    bucket_name=glam_bucket,
    prefix="extract-",
    google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
    dag=dag)

gcs_destination = "{}/extract-*.csv".format(glam_bucket)
glam_extract_to_csv = BigQueryToCloudStorageOperator(
    task_id="glam_extract_to_csv",
    source_project_dataset_table="glam_client_probe_counts_extract_v1",
    destination_cloud_storage_uris=gcs_destination,
    export_format="CSV",
    print_header=False,
    dag=dag)

wait_for_main_ping >> latest_versions

latest_versions >> clients_daily_scalar_aggregates