Example 1
def burnham_sensor(task_id, sql, gcp_conn_id=DEFAULT_GCP_CONN_ID, **kwargs):
    """Create a new BigQuerySQLSensorOperator that checks for burnham data.

    :param str task_id:                 [Required] ID for the task
    :param str sql:                     [Required] SQL for the sensor

    :param str gcp_conn_id:             Airflow connection id for GCP access
    :param Dict[str, Any] kwargs:       Additional kwargs for BigQuerySQLSensorOperator

    :return: BigQuerySQLSensorOperator
    """
    return BigQuerySQLSensorOperator(
        task_id=task_id,
        sql=sql,
        bigquery_conn_id=gcp_conn_id,
        use_legacy_sql=False,
        **kwargs,
    )
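For context, here is a minimal usage sketch of this helper inside a DAG definition. The DAG arguments, table name, SQL, and the extra timeout kwarg are illustrative assumptions, and burnham_sensor (together with BigQuerySQLSensorOperator and DEFAULT_GCP_CONN_ID) is assumed to be importable from the surrounding project.

# Hypothetical usage of burnham_sensor; names and SQL are assumptions.
from datetime import datetime, timedelta

from airflow import DAG

default_args = {
    'owner': 'example@mozilla.com',
    'start_date': datetime(2020, 6, 1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG('burnham', default_args=default_args, schedule_interval='@daily') as dag:
    wait_for_data = burnham_sensor(
        task_id='wait_for_discovery_data',
        sql=(
            'SELECT COUNT(*) '
            'FROM `moz-fx-data-shared-prod.burnham_live.discovery_v1` '  # assumed table
            'WHERE DATE(submission_timestamp) = "{{ ds }}"'
        ),
        # Extra kwargs are forwarded to BigQuerySQLSensorOperator;
        # timeout is the standard sensor timeout in seconds.
        timeout=60 * 60,
    )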
Example 2
def export_to_amplitude(
        parent_dag_name,
        dag_name,
        default_args,
        project,
        dataset,
        table_or_view,
        s3_prefix,
        gcs_bucket='moz-fx-data-derived-datasets-amplitude-export',
        gcp_conn_id='google_cloud_derived_datasets',
        amplitude_s3_conn='amplitude_s3_conn',
        amplitude_s3_bucket='com-amplitude-vacuum-mozilla-vacuum-wup'):

    """Export a bigquery table or view to Amplitude.

    This uses the BigQueryToCloudStorage operator to export the
    partition to GCS, then pushes that data to S3. It operates
    on a temporary table that is dropped after the job is finished.

    :param str parent_dag_name: Parent dag name
    :param str dag_name: This dag's name (appended to parent_dag_name)
    :param Dict[str, Any] default_args: Default arguments for the DAG
    :param str project: BigQuery project containing the table or view to be exported
    :param str dataset: BigQuery dataset containing the table or view
    :param str table_or_view: Table or view name
    :param str gcs_bucket: The GCS bucket the data will be exported to
    :param str gcp_conn_id: GCP connection ID
    :param str amplitude_s3_conn: S3 connection ID
    :param str amplitude_s3_bucket: The Amplitude S3 bucket the data will be pushed to
    :param str s3_prefix: The prefix for the S3 objects
    """

    environment = environ['DEPLOY_ENVIRONMENT']
    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        # For now, we assume the view is already updated
        # See https://github.com/mozilla/bigquery-etl/issues/218

        exec_date = '{{ ds }}'
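        # '{{ ds }}' is left in the SQL as a Jinja template; Airflow renders it to
        # the execution date (e.g. '2020-06-01') when the templated sql field is
        # rendered at run time.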

        # Check that we have data for this date
        check_sql = (
            'SELECT COUNT(*) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        wait_for_data = BigQuerySQLSensorOperator(
            task_id='wait_for_data',
            sql=check_sql,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Create the table with yesterday's data
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id
        temp_table_name = table_or_view + '_{{ ds_nodash }}'
        fully_qualified_table_name = '{}.{}.{}'.format(project_id, dataset, temp_table_name)

        sql = (
            'SELECT * EXCEPT (submission_timestamp) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        create_table = BigQueryOperator(
            task_id='create_temporary_table',
            sql=sql,
            destination_dataset_table=fully_qualified_table_name,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        directory = '/'.join((environment, s3_prefix, '{{ ds_nodash }}'))
        extension = '.tsv.gz'

        # Export from bq to gcs
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
        gcs_uri = 'gs://{}/{}/*{}'.format(gcs_bucket, directory, extension)
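        # For an execution date of 2020-06-01 and the default bucket this renders to
        # something like:
        # gs://moz-fx-data-derived-datasets-amplitude-export/<env>/<s3_prefix>/20200601/*.tsv.gz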
        table_extract = BigQueryToCloudStorageOperator(
            task_id='bq_to_gcs',
            source_project_dataset_table=fully_qualified_table_name,
            destination_cloud_storage_uris=[gcs_uri],
            bigquery_conn_id=gcp_conn_id,
            compression='GZIP',
            export_format='CSV',
            field_delimiter='\t',
            print_header=True
        )

        # Push the data to S3
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29 # noqa: E501
        s3_push = GoogleCloudStorageToS3Operator(
            task_id='gcs_to_s3',
            bucket=gcs_bucket,
            prefix=directory,
            delimiter=extension,
            google_cloud_storage_conn_id=gcp_conn_id,
            dest_aws_conn_id=amplitude_s3_conn,
            dest_s3_key='s3://{}/'.format(amplitude_s3_bucket),
            replace=True
        )

        # Drop the temporary table
        table_drop = BigQueryOperator(
            task_id='drop_temp_table',
            sql='DROP TABLE `{}`'.format(fully_qualified_table_name),
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Delete the GCS data
        data_delete = GoogleCloudStorageDeleteOperator(
            task_id='delete_gcs_data',
            bucket_name=gcs_bucket,
            prefix=directory,
            gcp_conn_id=gcp_conn_id
        )

        wait_for_data >> create_table >> table_extract >> s3_push
        s3_push >> table_drop
        s3_push >> data_delete

        return dag
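As a usage sketch, this helper would typically be attached to a parent DAG through a SubDagOperator, since it returns a DAG named '<parent_dag_name>.<dag_name>'. The import path below follows Airflow 1.10's layout; the parent DAG, schedule, project, dataset, view, and prefix names are illustrative assumptions rather than part of the original snippet.

# Hypothetical wiring of export_to_amplitude into a parent DAG.
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

default_args = {
    'owner': 'example@mozilla.com',
    'start_date': datetime(2020, 6, 1),
    'retries': 2,
    'retry_delay': timedelta(minutes=30),
}

with DAG('main_summary', default_args=default_args, schedule_interval='@daily') as parent_dag:
    # The subdag's dag_name must match this task_id so that the generated
    # '{parent_dag_name}.{dag_name}' id lines up with Airflow's subdag naming.
    amplitude_export = SubDagOperator(
        task_id='telemetry_events_amplitude_export',      # illustrative task id
        subdag=export_to_amplitude(
            parent_dag_name=parent_dag.dag_id,
            dag_name='telemetry_events_amplitude_export',
            default_args=default_args,
            project='moz-fx-data-shared-prod',            # assumed project
            dataset='telemetry',                          # assumed dataset
            table_or_view='events_amplitude_v1',          # assumed view
            s3_prefix='telemetry_events',                 # assumed S3 prefix
        ),
    )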