def s3_to_gcs():
    tasks = []
    file_list = set()
    # Read AWS credentials from Airflow Variables.
    ACCESS_KEY_ID = Variable.get(key="ACCESS_KEY_ID")
    SECRET_ACCESS_KEY = Variable.get(key="SECRET_ACCESS_KEY")
    session = boto3.Session(aws_access_key_id=ACCESS_KEY_ID,
                            aws_secret_access_key=SECRET_ACCESS_KEY)
    s3 = session.client('s3')
    bucket = s3.list_objects_v2(Bucket=s3_bucket)

    # Collect the distinct top-level folders in the source bucket.
    for obj in bucket['Contents']:
        file_list.add(obj['Key'].split('/')[0])

    # One transfer task per top-level folder.
    for folder in file_list:
        task_id = f"load_from_S3_{folder}"
        new_task = S3ToGoogleCloudStorageTransferOperator(
            aws_conn_id='aws_default',
            task_id=task_id,
            s3_bucket=s3_bucket,
            gcs_bucket=gcs_bucket,
            description=f"Transfer unloaded data from S3 for {folder}",
            object_conditions={'include_prefixes': [folder]},
            timeout=60,
            wait=1)
        tasks.append(new_task)
    return tasks
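Note that the function only builds and returns the operator list; nothing here attaches the tasks to a DAG. A minimal sketch of one way to wire them up, assuming a dag object exists in scope (the start marker task is illustrative, not part of the original):

# Sketch: attach the generated transfer tasks to an existing DAG and
# fan them out behind a single start marker.
start = DummyOperator(task_id='start', dag=dag)
for task in s3_to_gcs():
    task.dag = dag
    start >> task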
def test_templates(self, _):
    dag_id = 'test_dag_id'
    args = {'start_date': DEFAULT_DATE}
    self.dag = DAG(dag_id, default_args=args)  # pylint:disable=attribute-defined-outside-init
    op = S3ToGoogleCloudStorageTransferOperator(
        s3_bucket='{{ dag.dag_id }}',
        gcs_bucket='{{ dag.dag_id }}',
        description='{{ dag.dag_id }}',
        object_conditions={'exclude_prefixes': ['{{ dag.dag_id }}']},
        gcp_conn_id='{{ dag.dag_id }}',
        task_id=TASK_ID,
        dag=self.dag,
    )
    ti = TaskInstance(op, DEFAULT_DATE)
    ti.render_templates()
    self.assertEqual(dag_id, getattr(op, 's3_bucket'))
    self.assertEqual(dag_id, getattr(op, 'gcs_bucket'))
    self.assertEqual(dag_id, getattr(op, 'description'))
    # pylint:disable=unsubscriptable-object
    self.assertEqual(
        dag_id, getattr(op, 'object_conditions')['exclude_prefixes'][0])
    # pylint:enable=unsubscriptable-object
    self.assertEqual(dag_id, getattr(op, 'gcp_conn_id'))
def test_constructor(self):
    operator = S3ToGoogleCloudStorageTransferOperator(
        task_id=TASK_ID,
        s3_bucket=AWS_BUCKET_NAME,
        gcs_bucket=GCS_BUCKET_NAME,
        project_id=GCP_PROJECT_ID,
        description=DESCRIPTION,
        schedule=SCHEDULE_DICT,
    )
    self.assertEqual(operator.task_id, TASK_ID)
    self.assertEqual(operator.s3_bucket, AWS_BUCKET_NAME)
    self.assertEqual(operator.gcs_bucket, GCS_BUCKET_NAME)
    self.assertEqual(operator.project_id, GCP_PROJECT_ID)
    self.assertEqual(operator.description, DESCRIPTION)
    self.assertEqual(operator.schedule, SCHEDULE_DICT)
def create_dag(dag, folder, default_dag_args=None):
    # S3 to GCS transfer
    s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
        dag=dag,
        task_id=f"s3_to_gcs_{folder}",
        s3_bucket=s3_bucket,
        gcs_bucket=gcs_bucket,
        description="Transfer unloaded data from S3",
        object_conditions={'include_prefixes': [folder]},
        timeout=60,
        wait=1)

    # Arrange the DAG
    s3_to_gcs

    return dag
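Because create_dag returns the configured DAG, it lends itself to generating one DAG per folder dynamically. A hedged sketch, assuming default_dag_args is defined elsewhere and using placeholder folder names:

# Sketch: dynamic DAG generation, one DAG per folder. Registering each
# DAG in globals() lets the Airflow scheduler discover it.
for folder in ('raw', 'processed'):  # placeholder folder names
    dag_id = f"s3_to_gcs_{folder}"
    globals()[dag_id] = create_dag(
        DAG(dag_id, default_args=default_dag_args, schedule_interval=None),
        folder)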
def test_execute(self, mock_aws_hook, mock_transfer_hook):
    mock_aws_hook.return_value.get_credentials.return_value = Credentials(
        TEST_AWS_ACCESS_KEY_ID, TEST_AWS_ACCESS_SECRET, None)

    operator = S3ToGoogleCloudStorageTransferOperator(
        task_id=TASK_ID,
        s3_bucket=AWS_BUCKET_NAME,
        gcs_bucket=GCS_BUCKET_NAME,
        description=DESCRIPTION,
        schedule=SCHEDULE_DICT,
    )

    operator.execute(None)

    mock_transfer_hook.return_value.create_transfer_job.assert_called_once_with(
        body=VALID_TRANSFER_JOB_AWS_RAW)

    self.assertTrue(
        mock_transfer_hook.return_value.wait_for_transfer_job.called)
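VALID_TRANSFER_JOB_AWS_RAW is a fixture defined elsewhere in the test module. For orientation only, a transfer-job body for an AWS S3 source generally follows the Storage Transfer API shape sketched below; the values are illustrative, not the actual fixture:

# Approximate shape of a transfer-job body for an S3 -> GCS job, per the
# Storage Transfer API. This is a sketch, not the test's exact constant.
TRANSFER_JOB_BODY_SKETCH = {
    'description': DESCRIPTION,
    'status': 'ENABLED',
    'transferSpec': {
        'awsS3DataSource': {
            'bucketName': AWS_BUCKET_NAME,
            'awsAccessKey': {
                'accessKeyId': TEST_AWS_ACCESS_KEY_ID,
                'secretAccessKey': TEST_AWS_ACCESS_SECRET,
            },
        },
        'gcsDataSink': {'bucketName': GCS_BUCKET_NAME},
    },
    'schedule': SCHEDULE_DICT,
}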
def test_templates(self, _):
    dag_id = 'test_dag_id'
    configuration.load_test_config()
    args = {'start_date': DEFAULT_DATE}
    self.dag = DAG(dag_id, default_args=args)
    op = S3ToGoogleCloudStorageTransferOperator(
        s3_bucket='{{ dag.dag_id }}',
        gcs_bucket='{{ dag.dag_id }}',
        description='{{ dag.dag_id }}',
        object_conditions={'exclude_prefixes': ['{{ dag.dag_id }}']},
        gcp_conn_id='{{ dag.dag_id }}',
        task_id=TASK_ID,
        dag=self.dag,
    )
    ti = TaskInstance(op, DEFAULT_DATE)
    ti.render_templates()
    self.assertEqual(dag_id, getattr(op, 's3_bucket'))
    self.assertEqual(dag_id, getattr(op, 'gcs_bucket'))
    self.assertEqual(dag_id, getattr(op, 'description'))
    self.assertEqual(dag_id,
                     getattr(op, 'object_conditions')['exclude_prefixes'][0])
    self.assertEqual(dag_id, getattr(op, 'gcp_conn_id'))
dataset = "socorro_crash"
dataset_version = "v2"
date_submission_col = "crash_date"

objects_prefix = "{}/{}/{}={}".format(dataset,
                                      dataset_version,
                                      date_submission_col,
                                      "{{ ds_nodash }}")

# Copy JSON crashstats from S3 to GCS.
s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id="s3_to_gcs",
    s3_bucket="crashstats-telemetry-crashes-prod-us-west-2",
    gcs_bucket=gcs_data_bucket,
    description="socorro crash report copy from s3 to gcs",
    aws_conn_id=read_aws_conn_id,
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions={"includePrefixes": "v1/crash_report/{{ ds_nodash }}"},
    transfer_options={"deleteObjectsUniqueInSink": True},
    timeout=3600,
    dag=dag,
)

# Spark job reads GCS JSON and writes GCS Parquet.
crash_report_parquet = SubDagOperator(
    task_id="crash_report_parquet",
    dag=dag,
    subdag=moz_dataproc_pyspark_runner(
        parent_dag_name=dag.dag_id,
        dag_name="crash_report_parquet",
        default_args=default_args,
    '--location=US',
    'load',
    '--source_format=CSV',
    '--skip_leading_rows=0',
    '--replace',
    "--field_delimiter=\001",
    'blpadi.adi_dimensional_by_date${{ ds_nodash }}',
    'gs://moz-fx-data-derived-datasets-blpadi/blpadi/{{ ds }}/*',
]

s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
    task_id='s3_to_gcs',
    s3_bucket='net-mozaws-data-us-west-2-data-analysis',
    gcs_bucket='moz-fx-data-derived-datasets-blpadi',
    description='blpadi copy from s3 to gcs',
    aws_conn_id='aws_data_iam_blpadi',
    gcp_conn_id=gcp_conn_id,
    project_id=connection.project_id,
    object_conditions=gcstj_object_conditions,
    transfer_options=gcstj_transfer_options,
    timeout=720,
    dag=blp_dag)

load_blpadi_to_bq = GKEPodOperator(
    task_id='bigquery_load',
    name='load-blpadi-to-bq',
    image='google/cloud-sdk:242.0.0-alpine',
    arguments=bq_args,
    dag=blp_dag)

blp_logs.set_downstream(blp_job_sensor)
blp_job_sensor.set_downstream(s3_to_gcs)
s3_to_gcs.set_downstream(load_blpadi_to_bq)
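The set_downstream calls above declare a strictly linear chain; with Airflow's bitshift operators, the same dependencies read left to right:

# Equivalent dependency wiring using >> composition.
blp_logs >> blp_job_sensor >> s3_to_gcs >> load_blpadi_to_bq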
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017',  # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):
    """
    Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job that transfers the AWS S3 Parquet data into a GCS bucket.
    Once that completes, we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name: parent dag name
    :param dict default_args: dag configuration
    :param str dataset_s3_bucket: source S3 bucket
    :param str dataset_gcs_bucket: destination GCS bucket
    :param str aws_conn_id: airflow connection id for S3 access
    :param str gcp_conn_id: airflow connection id for GCP access
    :param str dataset: dataset name
    :param str dataset_version: dataset version
    :param str date_submission_col: dataset date submission column
    :param str ds_type: dataset format (ds or ds_nodash)
    :param str gke_location: GKE cluster zone
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use for GKE pod operations
    :param str bigquery_dataset: bigquery load destination dataset
    :param str p2b_concurrency: number of processes for parquet2bigquery load
    :param str p2b_table_alias: override p2b table name with alias
    :param str p2b_resume: allow resume support, defaults to False
    :param bool reprocess: enable dataset reprocessing, defaults to False
    :param str objects_prefix: custom objects_prefix to override defaults
    :param str spark_gs_dataset_location: custom spark dataset load location
        to override defaults
    :param List[str] cluster_by: top level fields to cluster by when creating
        destination table
    :param List[str] drop: top level fields to exclude from destination table
    :param Dict[str, str] rename: top level fields to rename in destination
        table
    :param List[str] replace: top level field replacement expressions

    :return airflow.models.DAG
    """
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
    ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]
    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id(
        '_'.join([dataset, dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
                timeout=3600,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name),  # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
        )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

    return dag
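As the docstring notes, load_to_bigquery is meant to run under a SubDagOperator. A minimal sketch of that call site, assuming a parent dag and default_args already exist; bucket, dataset, and cluster values are illustrative:

# Sketch: embedding load_to_bigquery in a parent DAG via SubDagOperator.
# The task_id must match the subdag's dag_name ('load_to_bigquery').
load_bq = SubDagOperator(
    task_id='load_to_bigquery',
    dag=dag,
    subdag=load_to_bigquery(
        parent_dag_name=dag.dag_id,
        default_args=default_args,
        dataset_s3_bucket='example-source-bucket',  # illustrative
        aws_conn_id='aws_default',
        dataset='socorro_crash',
        dataset_version='v2',
        gke_cluster_name='example-gke-cluster',  # illustrative
    ),
)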
    'schedule_interval': None,
    'email': '*****@*****.**'
}

# Read Airflow Variable
config = Variable.get("bioInfo_s3_to_gcs_config", deserialize_json=True)

# AWS variables
s3_bucket = config["s3_bucket"]

# GCS variables
gcs_bucket = config["gcs_bucket"]
gcs_include_prefix = '{{dag_run.conf["gcs_include_prefix"]}}'

# Start tasks
with models.DAG('s3_to_gcs_prefix',
                max_active_runs=1,
                default_args=default_dag_args) as dag:

    s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
        task_id='s3_to_gcs',
        s3_bucket=s3_bucket,
        gcs_bucket=gcs_bucket,
        description="Transfer unloaded data from S3",
        object_conditions={'include_prefixes': [gcs_include_prefix]},
        timeout=60,
        wait=1)

    # DAG creation
    s3_to_gcs
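Because gcs_include_prefix is pulled from dag_run.conf, this DAG only does useful work when triggered with a config payload; for example, with the Airflow 1.10 CLI (the prefix value is illustrative):

airflow trigger_dag s3_to_gcs_prefix -c '{"gcs_include_prefix": "genomics/batch1/"}'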