Example #1
def s3_to_gcs():
    tasks = []
    file_list = set()
    ACCESS_KEY_ID = Variable.get(key="ACCESS_KEY_ID")
    SECRET_ACCESS_KEY = Variable.get(key="SECRET_ACCESS_KEY")

    session = boto3.Session(aws_access_key_id=ACCESS_KEY_ID,
                            aws_secret_access_key=SECRET_ACCESS_KEY)

    s3 = session.client('s3')
    bucket = s3.list_objects_v2(Bucket=s3_bucket)

    for obj in bucket['Contents']:
        #     if obj['Size']>0:
        file_list.add(obj['Key'].split('/')[0])

    # file_list_1 = ['ACL']

    for folder in file_list:
        task_id = f"load_from_S3_{folder}"
        new_task = S3ToGoogleCloudStorageTransferOperator(
            aws_conn_id='aws_default',
            task_id=task_id,
            s3_bucket=s3_bucket,
            gcs_bucket=gcs_bucket,
            description=f"Transfer unloaded data from S3 for {folder}",
            object_conditions={'include_prefixes': [folder]},
            timeout=60,
            wait=1)
        tasks.append(new_task)
    return tasks
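The function above only builds the transfer tasks; the DAG wiring is left to the caller. A minimal sketch of that wiring follows, assuming the module-level `s3_bucket`, `gcs_bucket`, and a `default_dag_args` dict exist as the excerpt implies. Operators instantiated while a DAG context manager is active are attached to that DAG, so calling s3_to_gcs() inside the `with` block is enough.

from airflow import DAG

with DAG('s3_to_gcs_per_folder',          # hypothetical dag_id
         default_args=default_dag_args,
         schedule_interval=None) as dag:
    # Each generated transfer task picks up `dag` from the active context.
    s3_to_gcs()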
Example #2
    def test_templates(self, _):
        dag_id = 'test_dag_id'
        args = {'start_date': DEFAULT_DATE}
        self.dag = DAG(dag_id, default_args=args)  # pylint:disable=attribute-defined-outside-init
        op = S3ToGoogleCloudStorageTransferOperator(
            s3_bucket='{{ dag.dag_id }}',
            gcs_bucket='{{ dag.dag_id }}',
            description='{{ dag.dag_id }}',
            object_conditions={'exclude_prefixes': ['{{ dag.dag_id }}']},
            gcp_conn_id='{{ dag.dag_id }}',
            task_id=TASK_ID,
            dag=self.dag,
        )
        ti = TaskInstance(op, DEFAULT_DATE)
        ti.render_templates()
        self.assertEqual(dag_id, getattr(op, 's3_bucket'))
        self.assertEqual(dag_id, getattr(op, 'gcs_bucket'))
        self.assertEqual(dag_id, getattr(op, 'description'))

        # pylint:disable=unsubscriptable-object
        self.assertEqual(
            dag_id,
            getattr(op, 'object_conditions')['exclude_prefixes'][0])
        # pylint:enable=unsubscriptable-object

        self.assertEqual(dag_id, getattr(op, 'gcp_conn_id'))
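The test above confirms that s3_bucket, gcs_bucket, description, object_conditions, and gcp_conn_id are templated fields, so Jinja expressions can be passed to them directly. A minimal sketch of that templating in a DAG file, assuming the operator import and a `dag` object exist; the bucket names and prefix pattern are placeholders:

s3_to_gcs_templated = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs_templated',
    s3_bucket='example-source-bucket',                    # placeholder bucket
    gcs_bucket='example-destination-bucket',              # placeholder bucket
    description='daily transfer for {{ ds_nodash }}',     # rendered per run
    object_conditions={'include_prefixes': ['data/{{ ds_nodash }}']},
    dag=dag,
)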
Example #3
    def test_constructor(self):
        operator = S3ToGoogleCloudStorageTransferOperator(
            task_id=TASK_ID,
            s3_bucket=AWS_BUCKET_NAME,
            gcs_bucket=GCS_BUCKET_NAME,
            project_id=GCP_PROJECT_ID,
            description=DESCRIPTION,
            schedule=SCHEDULE_DICT,
        )

        self.assertEqual(operator.task_id, TASK_ID)
        self.assertEqual(operator.s3_bucket, AWS_BUCKET_NAME)
        self.assertEqual(operator.gcs_bucket, GCS_BUCKET_NAME)
        self.assertEqual(operator.project_id, GCP_PROJECT_ID)
        self.assertEqual(operator.description, DESCRIPTION)
        self.assertEqual(operator.schedule, SCHEDULE_DICT)
Example #4
def create_dag(dag, folder, default_dag_args=None):

    #S3 to GCS transfer
    s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
        dag=dag,
        task_id=f"s3_to_gcs_{folder}",
        s3_bucket=s3_bucket,
        gcs_bucket=gcs_bucket,
        description="Transfer unloaded data from S3",
        object_conditions={'include_prefixes': [folder]},
        timeout=60,
        wait=1)

    # Arrange the DAG
    s3_to_gcs

    return dag
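A minimal sketch of driving create_dag() to build one DAG per folder, assuming a `folders` iterable and `default_dag_args` exist alongside the module-level bucket names; each DAG is placed in globals() so the Airflow scheduler can discover it:

from airflow import models

for folder in folders:
    dag_id = f"s3_to_gcs_{folder}"
    dag = models.DAG(dag_id,
                     default_args=default_dag_args,
                     schedule_interval=None)
    # create_dag() attaches the transfer task and returns the same DAG object.
    globals()[dag_id] = create_dag(dag, folder, default_dag_args)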
Example #5
    def test_execute(self, mock_aws_hook, mock_transfer_hook):
        mock_aws_hook.return_value.get_credentials.return_value = Credentials(
            TEST_AWS_ACCESS_KEY_ID, TEST_AWS_ACCESS_SECRET, None)

        operator = S3ToGoogleCloudStorageTransferOperator(
            task_id=TASK_ID,
            s3_bucket=AWS_BUCKET_NAME,
            gcs_bucket=GCS_BUCKET_NAME,
            description=DESCRIPTION,
            schedule=SCHEDULE_DICT,
        )

        operator.execute(None)

        mock_transfer_hook.return_value.create_transfer_job.assert_called_once_with(
            body=VALID_TRANSFER_JOB_AWS_RAW)

        self.assertTrue(
            mock_transfer_hook.return_value.wait_for_transfer_job.called)
Example #6
    def test_templates(self, _):
        dag_id = 'test_dag_id'
        configuration.load_test_config()
        args = {'start_date': DEFAULT_DATE}
        self.dag = DAG(dag_id, default_args=args)
        op = S3ToGoogleCloudStorageTransferOperator(
            s3_bucket='{{ dag.dag_id }}',
            gcs_bucket='{{ dag.dag_id }}',
            description='{{ dag.dag_id }}',
            object_conditions={'exclude_prefixes': ['{{ dag.dag_id }}']},
            gcp_conn_id='{{ dag.dag_id }}',
            task_id=TASK_ID,
            dag=self.dag,
        )
        ti = TaskInstance(op, DEFAULT_DATE)
        ti.render_templates()
        self.assertEqual(dag_id, getattr(op, 's3_bucket'))
        self.assertEqual(dag_id, getattr(op, 'gcs_bucket'))
        self.assertEqual(dag_id, getattr(op, 'description'))
        self.assertEqual(dag_id, getattr(op, 'object_conditions')['exclude_prefixes'][0])
        self.assertEqual(dag_id, getattr(op, 'gcp_conn_id'))
Example #7
dataset = "socorro_crash"
dataset_version = "v2"
date_submission_col = "crash_date"

objects_prefix = "{}/{}/{}={}".format(dataset, dataset_version,
                                      date_submission_col, "{{ ds_nodash }}")

# copy json crashstats from s3 to gcs
s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id="s3_to_gcs",
    s3_bucket="crashstats-telemetry-crashes-prod-us-west-2",
    gcs_bucket=gcs_data_bucket,
    description="socorro crash report copy from s3 to gcs",
    aws_conn_id=read_aws_conn_id,
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions={"includePrefixes": "v1/crash_report/{{ ds_nodash }}"},
    transfer_options={"deleteObjectsUniqueInSink": True},
    timeout=3600,
    dag=dag,
)

# Spark job reads gcs json and writes gcs parquet
crash_report_parquet = SubDagOperator(
    task_id="crash_report_parquet",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet",
        default_args=default_args,
Example #8
bq_args = [
    '--location=US',
    'load',
    '--source_format=CSV',
    '--skip_leading_rows=0',
    '--replace',
    "--field_delimiter=\001",
    'blpadi.adi_dimensional_by_date${{ ds_nodash }}',
    'gs://moz-fx-data-derived-datasets-blpadi/blpadi/{{ ds }}/*',
]

s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs',
    s3_bucket='net-mozaws-data-us-west-2-data-analysis',
    gcs_bucket='moz-fx-data-derived-datasets-blpadi',
    description='blpadi copy from s3 to gcs',
    aws_conn_id='aws_data_iam_blpadi',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions=gcstj_object_conditions,
    transfer_options=gcstj_transfer_options,
    timeout=720,
    dag=blp_dag)

load_blpadi_to_bq = GKEPodOperator(task_id='bigquery_load',
                                   name='load-blpadi-to-bq',
                                   image='google/cloud-sdk:242.0.0-alpine',
                                   arguments=bq_args,
                                   dag=blp_dag)

blp_logs.set_downstream(blp_job_sensor)
blp_job_sensor.set_downstream(s3_to_gcs)
s3_to_gcs.set_downstream(load_blpadi_to_bq)
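The three set_downstream() calls above express the same dependency chain as the bitshift style used in Example #9 below:

# Equivalent ordering written with the >> operator.
blp_logs >> blp_job_sensor >> s3_to_gcs >> load_blpadi_to_bq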
Example #9
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017', # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):

    """ Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str dataset_s3_bucket:          source S3 Bucket
    :param str dataset_gcs_bucket:         destination GCS Bucket
    :param str aws_conn_id:                airflow connection id for S3 access
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str date_submission_col:        dataset date submission column
    :param str ds_type:                    dataset format (ds or ds_nodash)
    :param str gke_location:               GKE cluster zone
    :param str gke_namespace:              GKE cluster namespace
    :param str docker_image:               docker image to use for GKE pod operations # noqa
    :param str bigquery_dataset:           bigquery load destination dataset
    :param str p2b_concurrency:            number of processes for parquet2bigquery load
    :param str p2b_table_alias:            override p2b table name with alias
    :param bool p2b_resume:                allow resume support. Defaults to False
    :param bool reprocess:                 enable dataset reprocessing. Defaults to False
    :param str objects_prefix:             custom objects_prefix to override defaults
    :param str spark_gs_dataset_location:  custom spark dataset load location to override defaults
    :param List[str] cluster_by:           top level fields to cluster by when creating destination table
    :param List[str] drop:                 top level fields to exclude from destination table
    :param Dict[str, str] rename:          top level fields to rename in destination table
    :param List[str] replace:              top level field replacement expressions

    :return: airflow.models.DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
        ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]

    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset,
                                                                   dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
                timeout=3600,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name), # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
            )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

        return dag
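As the docstring notes, load_to_bigquery() is intended to be wrapped in a SubDagOperator from a parent DAG. A minimal sketch, assuming a parent `dag` and `default_args` exist; the bucket, dataset, and cluster names below are placeholders:

from airflow.operators.subdag_operator import SubDagOperator

load_bq = SubDagOperator(
    task_id='load_to_bigquery',   # matches the default dag_name so the subdag id lines up
    dag=dag,
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        default_args=default_args,
        dataset_s3_bucket='example-parquet-bucket',   # placeholder
        aws_conn_id='aws_default',
        dataset='example_dataset',                    # placeholder
        dataset_version='v1',
        gke_cluster_name='example-gke-cluster',       # placeholder
    ),
)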
Example #10
default_dag_args = {
    'schedule_interval': None,
    'email': '*****@*****.**'
}

#Read Airflow Variable
config = Variable.get("bioInfo_s3_to_gcs_config", deserialize_json=True)

#AWS Variables
s3_bucket = config["s3_bucket"]

#GCS Variables
gcs_bucket = config["gcs_bucket"]

gcs_include_prefix = '{{dag_run.conf["gcs_include_prefix"]}}'

#Start Tasks
with models.DAG('s3_to_gcs_prefix',
                max_active_runs=1,
                default_args=default_dag_args) as dag:

    s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
        task_id='s3_to_gcs',
        s3_bucket=s3_bucket,
        gcs_bucket=gcs_bucket,
        description="Transfer unloaded data from S3",
        object_conditions={'include_prefixes': [gcs_include_prefix]},
        timeout=60,
        wait=1)

    #Dag creation
    s3_to_gcs
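The include prefix above comes from the run configuration, so each manual trigger supplies it. A hedged sketch of that configuration; the prefix value is a placeholder:

# Hypothetical run configuration; {{ dag_run.conf["gcs_include_prefix"] }}
# renders to the value supplied when the run is created, for example via
#   airflow trigger_dag -c '{"gcs_include_prefix": "some/prefix/"}' s3_to_gcs_prefix
run_conf = {"gcs_include_prefix": "some/prefix/"}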