Example #1
    def test_execute_without_replace(self, mock_hook, mock_hook2):
        mock_hook.return_value.list.return_value = MOCK_FILES
        mock_hook.return_value.download.return_value = b"testing"
        mock_hook2.return_value.list.return_value = MOCK_FILES

        operator = GoogleCloudStorageToS3Operator(task_id=TASK_ID,
                                                  bucket=GCS_BUCKET,
                                                  prefix=PREFIX,
                                                  delimiter=DELIMITER,
                                                  dest_aws_conn_id=None,
                                                  dest_s3_key=S3_BUCKET,
                                                  replace=False)
        # create dest bucket with all the files
        hook = S3Hook(aws_conn_id=None)
        b = hook.get_bucket('bucket')
        b.create()
        for mock_file in MOCK_FILES:
            b.put_object(Key=mock_file, Body=b'testing')

        # we expect nothing to be uploaded
        # and all the MOCK_FILES to be present at the S3 bucket
        uploaded_files = operator.execute(None)
        self.assertEqual([],
                         uploaded_files)
        self.assertEqual(sorted(MOCK_FILES),
                         sorted(hook.list_keys('bucket', delimiter='/')))
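
For contrast, the matching replace=True case (a sketch only, assuming the same mocks, constants and S3 test fixtures as above) would expect every GCS key to be uploaded and reported back:

    def test_execute_with_replace(self, mock_hook, mock_hook2):
        mock_hook.return_value.list.return_value = MOCK_FILES
        mock_hook.return_value.download.return_value = b"testing"
        mock_hook2.return_value.list.return_value = MOCK_FILES

        operator = GoogleCloudStorageToS3Operator(task_id=TASK_ID,
                                                  bucket=GCS_BUCKET,
                                                  prefix=PREFIX,
                                                  delimiter=DELIMITER,
                                                  dest_aws_conn_id=None,
                                                  dest_s3_key=S3_BUCKET,
                                                  replace=True)
        # create the destination bucket; with replace=True existing keys are overwritten
        hook = S3Hook(aws_conn_id=None)
        hook.get_bucket('bucket').create()

        # every MOCK_FILE should be uploaded and present in the S3 bucket
        uploaded_files = operator.execute(None)
        self.assertEqual(sorted(MOCK_FILES), sorted(uploaded_files))
        self.assertEqual(sorted(MOCK_FILES),
                         sorted(hook.list_keys('bucket', delimiter='/')))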
Example #2
def export_to_amplitude(
        parent_dag_name,
        dag_name,
        default_args,
        project,
        dataset,
        table_or_view,
        s3_prefix,
        gcs_bucket='moz-fx-data-derived-datasets-amplitude-export',
        gcp_conn_id='google_cloud_derived_datasets',
        amplitude_s3_conn='amplitude_s3_conn',
        amplitude_s3_bucket='com-amplitude-vacuum-mozilla-vacuum-wup'):

    """Export a bigquery table or view to Amplitude.

    This uses the BigQueryToCloudStorage operator to export the
    partition to GCS, then pushes that data to S3. It operates
    on a temporary table that is dropped after the job is finished.

    :param str parent_dag_name: Parent dag name
    :param str dag_name: This dag's name (appended to parent_dag_name)
    :param str default_args: DAG configuration
    :param str dataset: BigQuery project containing the table to be exported
    :param str dataset: BigQuery dataset
    :param str table_or_view: Table or view name
    :param str gcs_bucket: The bucket the data will be exported to
    :param str gcp_conn_id: GCP connection ID
    :param str amplitude_s3_conn: S3 connection ID
    :param str amplitude_s3_bucket: The bucket to export data to
    :param str s3_prefix: The prefix for the s3 objects
    """

    environment = environ['DEPLOY_ENVIRONMENT']
    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        # For now, we assume the view is already updated
        # See https://github.com/mozilla/bigquery-etl/issues/218

        exec_date = '{{ ds }}'

        # Check that we have data for this date
        check_sql = (
            'SELECT COUNT(*) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        wait_for_data = BigQuerySQLSensorOperator(
                task_id='wait_for_data',
                sql=check_sql,
                bigquery_conn_id=gcp_conn_id,
                use_legacy_sql=False
        )

        # Create the table with yesterday's data
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id
        temp_table_name = table_or_view + '_{{ ds_nodash }}'
        fully_qualified_table_name = '{}.{}.{}'.format(project_id, dataset, temp_table_name)

        sql = (
            'SELECT * EXCEPT (submission_timestamp) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        create_table = BigQueryOperator(
            task_id='create_temporary_table',
            sql=sql,
            destination_dataset_table=fully_qualified_table_name,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        directory = '/'.join((environment, s3_prefix, '{{ ds_nodash }}'))
        extension = '.tsv.gz'

        # Export from bq to gcs
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
        gcs_uri = 'gs://{}/{}/*{}'.format(gcs_bucket, directory, extension)
        table_extract = BigQueryToCloudStorageOperator(
            task_id='bq_to_gcs',
            source_project_dataset_table=fully_qualified_table_name,
            destination_cloud_storage_uris=[gcs_uri],
            bigquery_conn_id=gcp_conn_id,
            compression='GZIP',
            export_format='CSV',
            field_delimiter='\t',
            print_header=True
        )

        # Push the data to S3
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29 # noqa: E501
        s3_push = GoogleCloudStorageToS3Operator(
            task_id='gcs_to_s3',
            bucket=gcs_bucket,
            prefix=directory,
            delimiter=extension,
            google_cloud_storage_conn_id=gcp_conn_id,
            dest_aws_conn_id=amplitude_s3_conn,
            dest_s3_key='s3://{}/'.format(amplitude_s3_bucket),
            replace=True
        )

        # Drop the temporary table
        table_drop = BigQueryOperator(
            task_id='drop_temp_table',
            sql='DROP TABLE `{}`'.format(fully_qualified_table_name),
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Delete the GCS data
        data_delete = GoogleCloudStorageDeleteOperator(
            task_id='delete_gcs_data',
            bucket_name=gcs_bucket,
            prefix=directory,
            gcp_conn_id=gcp_conn_id
        )

        wait_for_data >> create_table >> table_extract >> s3_push
        s3_push >> table_drop
        s3_push >> data_delete

        return dag
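
A minimal usage sketch for this factory (the enclosing main_dag, default_args and all project, dataset and table values below are placeholders, not taken from the original code): since it returns a DAG whose id is parent_dag_name.dag_name, it is typically attached to the parent DAG through a SubDagOperator whose task_id matches dag_name:

from airflow.operators.subdag_operator import SubDagOperator

amplitude_export = SubDagOperator(
    task_id='events_amplitude_export',      # must match dag_name below
    subdag=export_to_amplitude(
        parent_dag_name=main_dag.dag_id,    # the enclosing DAG (placeholder)
        dag_name='events_amplitude_export',
        default_args=default_args,
        project='my-gcp-project',           # placeholder
        dataset='telemetry',                # placeholder
        table_or_view='events_v1',          # placeholder
        s3_prefix='events',                 # placeholder
    ),
    dag=main_dag,
)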
Example #3
    # Export from bq to gcs
    # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28
    table_extract = BigQueryToCloudStorageOperator(
        task_id='bq_to_gcs',
        source_project_dataset_table='{project}.telemetry.{table}'.format(
            project=project_id, table=table_name),
        destination_cloud_storage_uris=[
            'gs://{bucket}/{dir}*{ext}'.format(bucket=gcs_bucket,
                                               dir=directory,
                                               ext=extension)
        ],
        bigquery_conn_id=gcp_conn_id,
        compression='GZIP',
        export_format='CSV',
        field_delimiter='\t',
        print_header=True)

    # Push the data to S3
    # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29
    s3_push = GoogleCloudStorageToS3Operator(
        task_id='gcs_to_s3',
        bucket=gcs_bucket,
        prefix=directory,
        delimiter=extension,
        google_cloud_storage_conn_id=gcp_conn_id,
        dest_aws_conn_id=amplitude_s3_conn,
        dest_s3_key='s3://{bucket}/'.format(bucket=amplitude_s3_bucket),
        replace=True)

    table_creation >> table_extract >> s3_push
Example #4
from datetime import datetime

from airflow import DAG
from airflow.operators.email_operator import EmailOperator
from airflow.contrib.operators.gcs_to_s3 import GoogleCloudStorageToS3Operator

default_args = {
    "owner": "dev",
    "start_date": datetime(2020, 9, 27),
    "end_date": datetime(2020, 9, 30),
    "depends_on_past": False,
}
dag = DAG(dag_id="gcs_to_s3",
          default_args=default_args,
          schedule_interval="@daily")
t1 = GoogleCloudStorageToS3Operator(
    task_id="copy_data",
    bucket="img_objects_to_audio",
    google_cloud_storage_conn_id="gcs_conn",
    dest_aws_conn_id="aws_conn",
    dest_s3_key="s3://img-objects-to-audio/",
    dag=dag,
)
t2 = EmailOperator(
    task_id="send_confirmation_mail",
    to="*****@*****.**",
    subject="data uploaded from gcs to s3",
    html_content="<h2>transfer operation is completed</h2>",
    dag=dag,
)

t1 >> t2
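
Because no prefix or delimiter is passed, t1 copies every object in the source bucket. A hedged variant that narrows the transfer (the prefix and delimiter values below are placeholders; delimiter is used the same way as in the examples above, effectively restricting the copy to keys with that extension) could look like:

t1_scoped = GoogleCloudStorageToS3Operator(
    task_id="copy_data_scoped",
    bucket="img_objects_to_audio",
    prefix="exports/2020-09-27/",   # placeholder folder within the bucket
    delimiter=".csv.gz",            # placeholder extension filter
    google_cloud_storage_conn_id="gcs_conn",
    dest_aws_conn_id="aws_conn",
    dest_s3_key="s3://img-objects-to-audio/",
    replace=False,                  # skip keys already present in S3
    dag=dag,
)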
Example #5
# Imports and DAG_ID assumed for this excerpt
from airflow import DAG
from airflow.contrib.operators.gcs_to_s3 import GoogleCloudStorageToS3Operator
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

DAG_ID = "gcs_to_s3_transfer"  # assumed value; defined outside the excerpt

# To have bucket names parametrized
AWS_BUCKET = Variable.get("AWS_BUCKET")
GCP_BUCKET = Variable.get("GCP_BUCKET")
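# One possible way to seed these Variables beforehand (assuming the Airflow 1.10 CLI;
# values are placeholders): `airflow variables --set GCP_BUCKET <source-gcs-bucket>`
# and `airflow variables --set AWS_BUCKET s3://<dest-s3-bucket>/`, or via the
# Admin -> Variables page in the web UI.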

def_args = {'start_date': days_ago(1), 'owner': 'cm0'}

transfer_dag = DAG(dag_id=DAG_ID,
                   schedule_interval=None,
                   default_args=def_args)

start = DummyOperator(task_id="start", dag=transfer_dag)

transfer_operator = GoogleCloudStorageToS3Operator(
    task_id="transfer_gcp_to_aws",
    bucket=GCP_BUCKET,
    dest_s3_key=AWS_BUCKET,
    replace=True,
    dag=transfer_dag)

end = DummyOperator(task_id="end", dag=transfer_dag)

# For troubleshooting access to the GCP and AWS resources, you can uncomment the operators
# below, which list both buckets. Not recommended for recursive listing or for buckets
# without inner folders (i.e. with many files directly in the bucket root). Also remember
# to comment out the `start >> transfer_operator >> end` line at the bottom.

# list_gcs = GoogleCloudStorageListOperator(task_id="list_gcs", bucket=GCP_BUCKET, dag=transfer_dag)
# list_s3 = S3ListOperator(task_id="list_s3", bucket=AWS_BUCKET, dag=transfer_dag)
# start >> list_gcs >> list_s3 >> transfer_operator >> end

start >> transfer_operator >> end
Example #6
    # (Excerpt opens mid-call: these are the trailing kwargs of the operator, presumably a
    # BigQueryOperator, that materializes `target_table` from `source_table` before export.)
    sql=bq_extract_query.format(source_table=source_table),
    destination_dataset_table=target_table)

extract_ga360_to_gcs = BigQueryToCloudStorageOperator(
    dag=dag,
    task_id='export_recent_yesterday_to_gcs',
    bigquery_conn_id=google_conn_id,
    source_project_dataset_table=target_table,
    destination_cloud_storage_uris=[gcs_output_file],
    compression='GZIP',
    export_format='CSV')

export_gcs_to_s3 = GoogleCloudStorageToS3Operator(
    dag=dag,
    task_id="cp_gcs_to_s3",
    dest_verify=True,
    google_cloud_storage_conn_id=google_conn_id,
    bucket=gcs_bucket,
    dest_aws_conn_id='local_s3',
    dest_s3_key=redshift_s3_bucket)

load_redshift = S3ToRedshiftTransfer(dag=dag,
                                     task_id="redshift_load",
                                     redshift_conn_id=redshift_conn_id,
                                     s3_file=s3_output_file,
                                     schema='public',
                                     table='ga360_sessions',
                                     iam_role=redshift_iam_role,
                                     copy_options=[
                                         'CSV', 'IGNOREHEADER 1', 'GZIP',
                                         """DATEFORMAT AS 'YYYYMMDD'"""
                                     ])
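
The excerpt stops before the task dependencies are declared; a plausible wiring for the tasks shown (the name of the upstream BigQuery extract task is assumed, since its definition falls outside the excerpt) would be:

# hypothetical name for the extract task whose kwargs appear at the top of the excerpt
bq_extract_to_table >> extract_ga360_to_gcs >> export_gcs_to_s3 >> load_redshift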