def test_execute(self, mock_s3_hook, mock_transfer_hook):
        """Test the execute function when the run is successful."""

        operator = S3ToGoogleCloudStorageTransferOperator(
            task_id=TASK_ID,
            s3_bucket=S3_BUCKET,
            gcs_bucket=GCS_BUCKET,
            project_id=PROJECT_ID,
        )

        mock_s3_hook.return_value.get_credentials.return_value = Credentials(
            access_key=ACCESS_KEY,
            secret_key=SECRET_KEY,
        )

        operator.execute(None)

        mock_transfer_hook.return_value.create_transfer_job.assert_called_once_with(
            project_id=PROJECT_ID,
            transfer_spec={
                'awsS3DataSource': {
                    'bucketName': S3_BUCKET,
                    'awsAccessKey': {
                        'accessKeyId': ACCESS_KEY,
                        'secretAccessKey': SECRET_KEY,
                    }
                },
                'gcsDataSink': {
                    'bucketName': GCS_BUCKET,
                },
                'objectConditions': {},
                'transferOptions': {}
            }
        )

    def test_constructor(self):
        """Test S3ToGoogleCloudStorageTransferOperator instance is properly initialized."""

        operator = S3ToGoogleCloudStorageTransferOperator(
            task_id=TASK_ID,
            s3_bucket=S3_BUCKET,
            gcs_bucket=GCS_BUCKET,
            project_id=PROJECT_ID,
        )

        self.assertEqual(operator.task_id, TASK_ID)
        self.assertEqual(operator.s3_bucket, S3_BUCKET)
        self.assertEqual(operator.gcs_bucket, GCS_BUCKET)
        self.assertEqual(operator.project_id, PROJECT_ID)
Example #3
    def test_constructor(self):
        """Test S3ToGoogleCloudStorageTransferOperator instance is properly initialized."""

        operator = S3ToGoogleCloudStorageTransferOperator(
            task_id=TASK_ID,
            s3_bucket=S3_BUCKET,
            gcs_bucket=GCS_BUCKET,
            project_id=PROJECT_ID,
            description=DESCRIPTION,
            schedule=SCHEDULE,
        )

        self.assertEqual(operator.task_id, TASK_ID)
        self.assertEqual(operator.s3_bucket, S3_BUCKET)
        self.assertEqual(operator.gcs_bucket, GCS_BUCKET)
        self.assertEqual(operator.project_id, PROJECT_ID)
        self.assertEqual(operator.description, DESCRIPTION)
        self.assertEqual(operator.schedule, SCHEDULE)
Example #4
    def test_execute_skip_wait(self, mock_s3_hook, mock_transfer_hook):
        """Test the execute function and wait until transfer is complete."""

        operator = S3ToGoogleCloudStorageTransferOperator(
            task_id=TASK_ID,
            s3_bucket=S3_BUCKET,
            gcs_bucket=GCS_BUCKET,
            project_id=PROJECT_ID,
            description=DESCRIPTION,
            wait=False,
        )

        mock_s3_hook.return_value.get_credentials.return_value = Credentials(
            access_key=ACCESS_KEY,
            secret_key=SECRET_KEY,
        )

        operator.execute(None)

        mock_transfer_hook.return_value.create_transfer_job.assert_called_once_with(
            project_id=PROJECT_ID,
            description=DESCRIPTION,
            schedule=None,
            transfer_spec={
                'awsS3DataSource': {
                    'bucketName': S3_BUCKET,
                    'awsAccessKey': {
                        'accessKeyId': ACCESS_KEY,
                        'secretAccessKey': SECRET_KEY,
                    }
                },
                'gcsDataSink': {
                    'bucketName': GCS_BUCKET,
                },
                'objectConditions': {},
                'transferOptions': {}
            }
        )

        assert not mock_transfer_hook.return_value.wait_for_transfer_job.called
Example #5
    '--location=US',
    'load',
    '--source_format=CSV',
    '--skip_leading_rows=0',
    '--replace',
    "--field_delimiter=\001",
    'blpadi.adi_dimensional_by_date${{ ds_nodash }}',
    'gs://moz-fx-data-derived-datasets-blpadi/blpadi/{{ ds }}/*',
]

s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs',
    s3_bucket='net-mozaws-data-us-west-2-data-analysis',
    gcs_bucket='moz-fx-data-derived-datasets-blpadi',
    description='blpadi copy from s3 to gcs',
    aws_conn_id='aws_data_iam_blpadi',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions=gcstj_object_conditions,
    transfer_options=gcstj_transfer_options,
    dag=blp_dag)

load_blpadi_to_bq = GKEPodOperator(task_id='bigquery_load',
                                   gcp_conn_id=gcp_conn_id,
                                   project_id=connection.project_id,
                                   location='us-central1-a',
                                   cluster_name='bq-load-gke-1',
                                   name='load-blpadi-to-bq',
                                   namespace='default',
                                   image='google/cloud-sdk:242.0.0-alpine',
                                   arguments=bq_args,
                                   dag=blp_dag)
Example #6
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017', # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):

    """ Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed, we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str dataset_s3_bucket:          source S3 Bucket
    :param str dataset_gcs_bucket:         destination GCS Bucket
    :param str aws_conn_id:                airflow connection id for S3 access
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str date_submission_col:        dataset date submission column
    :param str ds_type:                    dataset format (ds or ds_nodash)
    :param str gke_location:               GKE cluster zone
    :param str gke_namespace:              GKE cluster namespace
    :param str docker_image:               docker image to use for GKE pod operations # noqa
    :param str bigquery_dataset:           bigquery load destination dataset
    :param str p2b_concurrency:            number of processes for parquet2bigquery load
    :param str p2b_table_alias:            override p2b table name with alias
    :param str p2b_resume:                allow resume support. Defaults to False.
    :param bool reprocess:                 enable dataset reprocessing. Defaults to False.
    :param str objects_prefix:             custom objects_prefix to override defaults
    :param str spark_gs_dataset_location:  custom spark dataset load location to override defaults
    :param List[str] cluster_by:           top level fields to cluster by when creating destination table
    :param List[str] drop:                 top level fields to exclude from destination table
    :param Dict[str, str] rename:          top level fields to rename in destination table
    :param List[str] replace:              top level field replacement expressions

    :return: airflow.models.DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
        ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]

    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset,
                                                                   dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name), # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
            )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

        return dag
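A minimal usage sketch for the helper above, following the docstring's note that it is used with SubDagOperator. The parent DAG id, source bucket, connection id, and dataset names here are illustrative assumptions, not values taken from the original DAGs.

from datetime import datetime, timedelta

from airflow import models
from airflow.operators.subdag_operator import SubDagOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 10, 1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with models.DAG('example_parent_dag',
                default_args=default_args,
                schedule_interval='@daily') as parent_dag:

    # Wrap load_to_bigquery in a SubDagOperator; the subdag id becomes
    # 'example_parent_dag.example_load_to_bigquery', matching the task_id as required.
    load_to_bq = SubDagOperator(
        task_id='example_load_to_bigquery',
        subdag=load_to_bigquery(
            parent_dag_name=parent_dag.dag_id,
            default_args=default_args,
            dataset_s3_bucket='example-source-s3-bucket',  # hypothetical source bucket
            aws_conn_id='aws_example_conn',                # hypothetical Airflow connection
            dataset='example_dataset',                     # hypothetical dataset name
            dataset_version='v1',
            gke_cluster_name='bq-load-gke-1',
            dag_name='example_load_to_bigquery',
        ),
    )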
Example #7
project_id = 'my-gcp-project'
gcs_bucket = 's3-to-bq-' + s3_bucket # temporary bucket to store file
dag_id = re.sub('[^0-9a-zA-Z]+', '_', include_prefix) # use s3 prefix as dag name so it is unique

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'end_date': end_date,
    'email': email_alert,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG(dag_id, schedule_interval=schedule_interval,
        default_args=default_args) as dag:

    s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
        task_id='s3_to_gcs',
        s3_bucket=s3_bucket,
        project_id=project_id,
        gcs_bucket=gcs_bucket,
        description='_'.join([gcs_bucket, include_prefix]),
        object_conditions={'include_prefixes': [include_prefix]},
        replace=True
    )

    s3_to_gcs
Example #8
gcs_data_bucket = 'moz-fx-data-prod-socorro-data'

dataset = 'socorro_crash'
dataset_version = 'v2'
date_submission_col = 'crash_date'

objects_prefix = '{}/{}/{}={}'.format(dataset, dataset_version,
                                      date_submission_col, "{{ ds_nodash }}")

# copy json crashstats from s3 to gcs
s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs',
    s3_bucket='crashstats-telemetry-crashes-prod-us-west-2',
    gcs_bucket=gcs_data_bucket,
    description='socorro crash report copy from s3 to gcs',
    aws_conn_id=read_aws_conn_id,
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions={'includePrefixes': 'v1/crash_report/{{ ds_nodash }}'},
    transfer_options={'deleteObjectsUniqueInSink': True},
    dag=dag,
)

# Spark job reads gcs json and writes gcs parquet
crash_report_parquet = SubDagOperator(
    task_id="crash_report_parquet",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name='crash_report_parquet',
        default_args=default_args,
        cluster_name=cluster_name,