# Unit test for GCSSynchronizeBucketsOperator.execute. `mock_hook` is injected by the
# @mock.patch decorator below (the patch target assumes the provider-package layout),
# and IMPERSONATION_CHAIN is a module-level constant of the test file.
@mock.patch("airflow.providers.google.cloud.operators.gcs.GCSHook")
def test_execute(self, mock_hook):
    task = GCSSynchronizeBucketsOperator(
        task_id="task-id",
        source_bucket="SOURCE_BUCKET",
        destination_bucket="DESTINATION_BUCKET",
        source_object="SOURCE_OBJECT",
        destination_object="DESTINATION_OBJECT",
        recursive=True,
        delete_extra_files=True,
        allow_overwrite=True,
        gcp_conn_id="GCP_CONN_ID",
        delegate_to="DELEGATE_TO",
        impersonation_chain=IMPERSONATION_CHAIN,
    )
    task.execute({})
    # The operator must build the hook from the connection settings it was given...
    mock_hook.assert_called_once_with(
        google_cloud_storage_conn_id='GCP_CONN_ID',
        delegate_to='DELEGATE_TO',
        impersonation_chain=IMPERSONATION_CHAIN,
    )
    # ...and delegate the actual synchronization to GCSHook.sync.
    mock_hook.return_value.sync.assert_called_once_with(
        source_bucket='SOURCE_BUCKET',
        source_object='SOURCE_OBJECT',
        destination_bucket='DESTINATION_BUCKET',
        destination_object='DESTINATION_OBJECT',
        delete_extra_files=True,
        recursive=True,
        allow_overwrite=True,
    )
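# For context, the operator above is a thin wrapper: the same synchronization can be
# driven directly through the hook it delegates to. A minimal sketch, assuming a
# configured "google_cloud_default" connection; the keyword arguments mirror the ones
# asserted in the test.
from airflow.providers.google.cloud.hooks.gcs import GCSHook

hook = GCSHook(gcp_conn_id="google_cloud_default")
hook.sync(
    source_bucket="SOURCE_BUCKET",
    destination_bucket="DESTINATION_BUCKET",
    source_object="SOURCE_OBJECT",
    destination_object="DESTINATION_OBJECT",
    recursive=True,
    delete_extra_files=True,
    allow_overwrite=True,
)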
# Example DAG exercising GCSSynchronizeBucketsOperator. Imports the excerpt relies on
# (assumed Airflow 2.x provider-package paths); BUCKET_1_SRC / BUCKET_1_DST are defined
# earlier in the file, analogously to the BUCKET_2_* / BUCKET_3_* variables below.
import os

from airflow import models
from airflow.providers.google.cloud.operators.gcs import GCSSynchronizeBucketsOperator
from airflow.utils.dates import days_ago

BUCKET_2_SRC = os.environ.get("GCP_GCS_BUCKET_2_SRC", "test-gcs-sync-2-src")
BUCKET_2_DST = os.environ.get("GCP_GCS_BUCKET_2_DST", "test-gcs-sync-2-dst")
BUCKET_3_SRC = os.environ.get("GCP_GCS_BUCKET_3_SRC", "test-gcs-sync-3-src")
BUCKET_3_DST = os.environ.get("GCP_GCS_BUCKET_3_DST", "test-gcs-sync-3-dst")

OBJECT_1 = os.environ.get("GCP_GCS_OBJECT_1", "test-gcs-to-gcs-1")
OBJECT_2 = os.environ.get("GCP_GCS_OBJECT_2", "test-gcs-to-gcs-2")

with models.DAG(
    "example_gcs_to_gcs", schedule_interval='@once', start_date=days_ago(1), tags=['example']
) as dag:
    # [START howto_synch_bucket]
    # Basic sync: copy objects from the source bucket that are missing in the destination.
    sync_bucket = GCSSynchronizeBucketsOperator(
        task_id="sync_bucket", source_bucket=BUCKET_1_SRC, destination_bucket=BUCKET_1_DST
    )
    # [END howto_synch_bucket]

    # [START howto_synch_full_bucket]
    # Full mirror: also overwrite objects that differ and delete destination objects
    # that no longer exist in the source.
    sync_full_bucket = GCSSynchronizeBucketsOperator(
        task_id="sync_full_bucket",
        source_bucket=BUCKET_1_SRC,
        destination_bucket=BUCKET_1_DST,
        delete_extra_files=True,
        allow_overwrite=True,
    )
    # [END howto_synch_full_bucket]

    # [START howto_synch_to_subdir]
    sync_to_subdirectory = GCSSynchronizeBucketsOperator(
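    # (The excerpt ends mid-call.) For illustration only: the object prefixes below are
    # hypothetical, not taken from the source file. A sync into a subdirectory sets
    # source_object / destination_object to prefixes inside the buckets:
    #
    # sync_to_subdirectory = GCSSynchronizeBucketsOperator(
    #     task_id="sync_to_subdirectory",
    #     source_bucket=BUCKET_1_SRC,
    #     source_object="dir-1/",          # hypothetical source prefix
    #     destination_bucket=BUCKET_1_DST,
    #     destination_object="dir-2/",     # hypothetical destination prefix
    # )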
# Excerpt from an ETL pipeline DAG (assumed Airflow 2.x import paths; adjust for older
# versions). `create_source_from_airflow` is a user-defined callable not shown here.
import os

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocInstantiateWorkflowTemplateOperator,
)
from airflow.providers.google.cloud.operators.gcs import GCSSynchronizeBucketsOperator
from airflow.utils.dates import days_ago

GCP_BQ_TABLE_AUTHORS = os.environ['GCP_BQ_TABLE_AUTHORS']
GCP_BQ_TABLE_REVIEWS = os.environ['GCP_BQ_TABLE_REVIEWS']
GCP_BQ_TABLE_BOOKS = os.environ['GCP_BQ_TABLE_BOOKS']

with DAG(
    dag_id='etl_pipeline',
    schedule_interval=None,
    start_date=days_ago(200),
    default_args={},
) as dag:
    # Produce the source data for this run (callable defined elsewhere in the project).
    grab_data = PythonOperator(task_id='grab_data', python_callable=create_source_from_airflow)

    # Promote this run's objects from the landing bucket to the working bucket; the
    # templated `run_id` keeps each DAG run under its own prefix.
    transfer_grabbed_data = GCSSynchronizeBucketsOperator(
        task_id='transfer_grabbed_data',
        source_bucket=os.environ['GCP_GCS_BUCKET_LANDING'],
        destination_bucket=os.environ['GCP_GCS_BUCKET_WORKING'],
        gcp_conn_id='gr_storage_conn',
        source_object='{{ run_id }}',
        destination_object='{{ run_id }}',
    )

    # Run the Spark ETL job on Dataproc from a pre-registered workflow template.
    spark_etl = DataprocInstantiateWorkflowTemplateOperator(
        task_id='spark_etl',
        template_id=os.environ['GCP_DATAPROC_TEMPLATE_ID'],
        project_id=os.environ['GCP_PROJECT_ID'],
        region=os.environ['GCP_REGION'],
        parameters={
            'PATH_TO_ETL_FILE': f"gs://{os.environ['GCP_GCS_BUCKET_ROUTINE']}/etl/etl.py",
            'CLUSTER_NAME': 'goodreads-etl',
            'ARG_SOURCE_BUCKET': os.environ['GCP_GCS_BUCKET_WORKING'],
            'ARG_DESTINATION_BUCKET': os.environ['GCP_GCS_BUCKET_PROCESSED'],
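    # (The excerpt ends inside the `parameters` dict.) The ordering below is an assumption
    # about how such a pipeline is typically wired, not taken from the source:
    #
    # grab_data >> transfer_grabbed_data >> spark_etl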