def test_executes_with_empty_source_objects(self, mock_hook):
    """Check the listing issued when ``SOURCE_OBJECTS_NO_FILE`` is the source.

    The hook's ``list`` must be called exactly once, against the source
    bucket, with an empty prefix and no delimiter.
    """
    hook = mock_hook.return_value
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_objects=SOURCE_OBJECTS_NO_FILE,
    )

    copy_op.execute(None)

    hook.list.assert_called_once_with(TEST_BUCKET, prefix='', delimiter=None)
def test_raises_exception_with_two_empty_list_inside_source_objects(self, mock_hook):
    """``execute`` must raise when given ``SOURCE_OBJECTS_TWO_EMPTY_STRING``.

    The raised ``AirflowException`` is matched against the expected message.
    """
    hook = mock_hook.return_value
    hook.list.return_value = SOURCE_OBJECTS_LIST
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_objects=SOURCE_OBJECTS_TWO_EMPTY_STRING,
    )
    expected_message = "You can't have two empty strings inside source_object"

    with pytest.raises(AirflowException, match=expected_message):
        copy_op.execute(None)
def test_execute_no_suffix(self, mock_hook):
    """Check the listing issued for ``SOURCE_OBJECT_WILDCARD_SUFFIX``.

    The hook's ``list`` must be called exactly once with the prefix
    ``"test_object"`` and an empty-string delimiter.
    """
    hook = mock_hook.return_value
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_SUFFIX,
        destination_bucket=DESTINATION_BUCKET,
    )

    copy_op.execute(None)

    hook.list.assert_called_once_with(TEST_BUCKET, prefix="test_object", delimiter="")
def test_no_prefix_with_last_modified_time_with_false_cond(self, mock_hook):
    """No rewrite is issued when ``is_updated_after`` reports ``False``.

    The operator is given ``last_modified_time=MOD_TIME_1`` and a
    non-wildcard source object.
    """
    hook = mock_hook.return_value
    hook.is_updated_after.return_value = False
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_NO_WILDCARD,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=SOURCE_OBJECT_NO_WILDCARD,
        last_modified_time=MOD_TIME_1,
    )

    copy_op.execute(None)

    hook.rewrite.assert_not_called()
def test_execute_no_prefix_with_no_last_modified_time(self, mock_hook):
    """With ``last_modified_time=None`` the object is rewritten exactly once.

    The single expected rewrite copies ``test_object.txt`` from the source
    bucket to the same object name in the destination bucket.
    """
    hook = mock_hook.return_value
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_NO_WILDCARD,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=SOURCE_OBJECT_NO_WILDCARD,
        last_modified_time=None,
    )

    copy_op.execute(None)

    expected_args = (TEST_BUCKET, 'test_object.txt', DESTINATION_BUCKET, 'test_object.txt')
    hook.rewrite.assert_called_once_with(*expected_args)
def test_no_prefix_with_maximum_modified_time_with_true_cond(self, mock_hook):
    """One rewrite is issued when ``is_updated_before`` reports ``True``.

    The operator is given ``maximum_modified_time=MOD_TIME_1`` and a
    non-wildcard source object.
    """
    hook = mock_hook.return_value
    hook.is_updated_before.return_value = True
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_NO_WILDCARD,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=SOURCE_OBJECT_NO_WILDCARD,
        maximum_modified_time=MOD_TIME_1,
    )

    copy_op.execute(None)

    expected_args = (TEST_BUCKET, 'test_object.txt', DESTINATION_BUCKET, 'test_object.txt')
    hook.rewrite.assert_called_once_with(*expected_args)
def test_executes_with_multiple_items_in_source_objects(self, mock_hook):
    """Check the listings issued for ``SOURCE_OBJECTS_MULTIPLE_FILES``.

    The hook's ``list`` must have been called for both ``file1.txt`` and
    ``file2.txt`` under ``test_object/``; call order is not significant.
    """
    hook = mock_hook.return_value
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_objects=SOURCE_OBJECTS_MULTIPLE_FILES,
    )

    copy_op.execute(None)

    expected_listings = [
        mock.call(TEST_BUCKET, prefix=object_name, delimiter=None)
        for object_name in ('test_object/file1.txt', 'test_object/file2.txt')
    ]
    hook.list.assert_has_calls(expected_listings, any_order=True)
def test_executes_with_no_destination_bucket_and_no_destination_object(self, mock_hook):
    """Without any destination, objects are rewritten within the source bucket.

    Each of the three listed objects is expected to be rewritten to the same
    object name inside ``TEST_BUCKET``.
    """
    hook = mock_hook.return_value
    hook.list.return_value = SOURCE_OBJECTS_LIST
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_objects=SOURCE_OBJECTS_LIST,
    )

    copy_op.execute(None)

    object_names = (
        'test_object/file1.txt',
        'test_object/file2.txt',
        'test_object/file3.json',
    )
    expected_rewrites = [
        mock.call(TEST_BUCKET, object_name, TEST_BUCKET, object_name)
        for object_name in object_names
    ]
    hook.rewrite.assert_has_calls(expected_rewrites)
def test_wc_with_last_modified_time_with_one_true_cond(self, mock_hook):
    """Only the object whose ``is_updated_after`` check passes is rewritten.

    ``is_updated_after`` answers ``True, False, False`` in turn, and the
    single expected rewrite is for ``test_object/file1.txt``.
    """
    hook = mock_hook.return_value
    hook.list.return_value = SOURCE_FILES_LIST
    hook.is_updated_after.side_effect = [True, False, False]
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
        destination_bucket=DESTINATION_BUCKET,
        last_modified_time=MOD_TIME_1,
    )

    copy_op.execute(None)

    copied_object = 'test_object/file1.txt'
    hook.rewrite.assert_called_once_with(
        TEST_BUCKET, copied_object, DESTINATION_BUCKET, copied_object
    )
def test_executes_with_is_older_than_with_true_cond(self, mock_hook):
    """One rewrite is issued when ``is_older_than`` reports ``True``.

    The operator receives ``last_modified_time``, ``maximum_modified_time``
    and ``is_older_than=3600`` together.
    """
    hook = mock_hook.return_value
    hook.is_older_than.return_value = True
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_NO_WILDCARD,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=SOURCE_OBJECT_NO_WILDCARD,
        last_modified_time=MOD_TIME_1,
        maximum_modified_time=MOD_TIME_2,
        is_older_than=3600,
    )

    copy_op.execute(None)

    expected_args = (TEST_BUCKET, 'test_object.txt', DESTINATION_BUCKET, 'test_object.txt')
    hook.rewrite.assert_called_once_with(*expected_args)
def test_execute_wildcard_with_replace_flag_false(self, mock_hook):
    """With ``replace=False`` both source and destination buckets are listed.

    The hook's ``list`` is expected to be called for ``TEST_BUCKET`` and then
    ``DESTINATION_BUCKET``, each with prefix ``"test_object"`` and an
    empty-string delimiter.
    """
    hook = mock_hook.return_value
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_SUFFIX,
        destination_bucket=DESTINATION_BUCKET,
        replace=False,
    )

    copy_op.execute(None)

    expected_listings = [
        mock.call(bucket, prefix="test_object", delimiter="")
        for bucket in (TEST_BUCKET, DESTINATION_BUCKET)
    ]
    hook.list.assert_has_calls(expected_listings)
def test_execute_with_empty_destination_bucket(self, mock_hook):
    """``destination_bucket=None`` logs a warning and falls back to the source.

    The operator's ``log.warning`` is patched so the warning can be checked;
    afterwards ``destination_bucket`` must equal ``source_bucket``.
    """
    hook = mock_hook.return_value
    hook.list.return_value = SOURCE_FILES_LIST
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_NO_WILDCARD,
        destination_bucket=None,
        destination_object=DESTINATION_OBJECT_PREFIX,
    )

    with mock.patch.object(copy_op.log, 'warning') as warning_spy:
        copy_op.execute(None)

        expected_template = 'destination_bucket is None. Defaulting it to source_bucket (%s)'
        warning_spy.assert_called_once_with(expected_template, TEST_BUCKET)
        self.assertEqual(copy_op.destination_bucket, copy_op.source_bucket)
def test_execute_more_than_1_wildcard(self, mock_hook):
    """A source object with several wildcards makes ``execute`` raise.

    The expected message is used as a regular expression, so ``'[*]'`` is a
    character class matching a literal ``*`` in the raised message.
    """
    hook = mock_hook.return_value
    hook.list.return_value = SOURCE_FILES_LIST
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_MULTIPLE_WILDCARDS,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=DESTINATION_OBJECT_PREFIX,
    )

    wildcard_count = copy_op.source_object.count(WILDCARD)
    expected_pattern = (
        "Only one wildcard '[*]' is allowed in source_object parameter. "
        "Found {}"
    ).format(wildcard_count)

    with self.assertRaisesRegex(AirflowException, expected_pattern):
        copy_op.execute(None)
def test_execute_wildcard_without_destination_object(self, mock_hook):
    """Without ``destination_object`` the object names are kept unchanged.

    ``file1.txt`` and ``file2.txt`` under ``test_object/`` are expected to be
    rewritten to the same names in ``DESTINATION_BUCKET``.
    """
    hook = mock_hook.return_value
    hook.list.return_value = SOURCE_FILES_LIST
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
        destination_bucket=DESTINATION_BUCKET,
    )

    copy_op.execute(None)

    expected_rewrites = [
        mock.call(TEST_BUCKET, object_name, DESTINATION_BUCKET, object_name)
        for object_name in ('test_object/file1.txt', 'test_object/file2.txt')
    ]
    hook.rewrite.assert_has_calls(expected_rewrites)
def test_executes_with_delimiter_and_destination_object(self, mock_hook):
    """Check the rewrite issued when a delimiter and destination are given.

    The hook's ``list`` is stubbed to return only ``test_object/file3.json``,
    which is expected to be rewritten to ``DESTINATION_OBJECT`` in
    ``DESTINATION_BUCKET``.
    """
    listed_object = 'test_object/file3.json'
    hook = mock_hook.return_value
    hook.list.return_value = [listed_object]
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_objects=SOURCE_OBJECTS_LIST,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=DESTINATION_OBJECT,
        delimiter=DELIMITER,
    )

    copy_op.execute(None)

    expected_rewrite = mock.call(
        TEST_BUCKET, listed_object, DESTINATION_BUCKET, DESTINATION_OBJECT
    )
    hook.rewrite.assert_has_calls([expected_rewrite])
def test_wc_with_last_modified_time_with_all_true_cond_no_file(self, mock_hook):
    """Every listed object is rewritten when all ``is_updated_after`` checks pass.

    The operator is given ``SOURCE_OBJECTS_NO_FILE`` and
    ``last_modified_time=MOD_TIME_1``; the three listed objects are expected
    to be rewritten under the same names in ``DESTINATION_BUCKET``.
    """
    hook = mock_hook.return_value
    hook.list.return_value = SOURCE_OBJECTS_LIST
    hook.is_updated_after.side_effect = [True, True, True]
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_objects=SOURCE_OBJECTS_NO_FILE,
        destination_bucket=DESTINATION_BUCKET,
        last_modified_time=MOD_TIME_1,
    )

    copy_op.execute(None)

    object_names = (
        'test_object/file1.txt',
        'test_object/file2.txt',
        'test_object/file3.json',
    )
    expected_rewrites = [
        mock.call(TEST_BUCKET, object_name, DESTINATION_BUCKET, object_name)
        for object_name in object_names
    ]
    hook.rewrite.assert_has_calls(expected_rewrites)
def test_execute_wildcard_with_destination_object_retained_prefix(self, mock_hook):
    """Check rewrites when the destination repeats the source prefix.

    ``destination_object`` is built from ``DESTINATION_OBJECT_PREFIX`` and
    ``SOURCE_OBJECT_WILDCARD_SUFFIX`` minus its last character; the expected
    destinations are ``foo/bar/test_object/file1.txt`` and ``.../file2.txt``.
    """
    hook = mock_hook.return_value
    hook.list.return_value = SOURCE_FILES_LIST
    retained_destination = '{}/{}'.format(
        DESTINATION_OBJECT_PREFIX, SOURCE_OBJECT_WILDCARD_SUFFIX[:-1]
    )
    copy_op = GCSToGCSOperator(
        task_id=TASK_ID,
        source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=retained_destination,
    )

    copy_op.execute(None)

    source_to_destination = (
        ('test_object/file1.txt', 'foo/bar/test_object/file1.txt'),
        ('test_object/file2.txt', 'foo/bar/test_object/file2.txt'),
    )
    expected_rewrites = [
        mock.call(TEST_BUCKET, source_name, DESTINATION_BUCKET, destination_name)
        for source_name, destination_name in source_to_destination
    ]
    hook.rewrite.assert_has_calls(expected_rewrites)
"""
The airflow DAG to backup Zarr

Note that this docstring must contain the strings "airflow" and "DAG" for
Airflow to properly detect it as a DAG
See: http://bit.ly/307VMum

See here for documentation on GCS Airflow Operators:
https://airflow.apache.org/docs/apache-airflow-providers-google/stable/_modules/airflow/providers/google/cloud/example_dags/example_gcs_to_gcs.html
"""
import os

from airflow import models
from airflow.providers.google.cloud.operators.gcs import GCSSynchronizeBucketsOperator
from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator
from airflow.utils.dates import days_ago

# Zarr directory
# NOTE(review): this value is a bucket name followed by an object path, yet it
# is passed below as ``source_bucket``. Confirm whether only the leading
# "solar-pv-nowcasting-data" component should be the bucket, with the rest
# moved into ``source_object``.
BUCKET_1_SRC = (
    "solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16"
)

# Copy every object matching "data/*.txt" in the source to "backup/" in the
# destination bucket.
# NOTE(review): BUCKET_1_DST is not defined in the visible part of this file,
# and the operator is created outside any DAG context here -- confirm both
# against the full file.
copy_files_with_wildcard = GCSToGCSOperator(
    task_id="copy_files_with_wildcard",
    source_bucket=BUCKET_1_SRC,
    source_object="data/*.txt",
    destination_bucket=BUCKET_1_DST,
    destination_object="backup/",
)
"""Airflow DAG that copies a single object from one GCS bucket to another."""
from __future__ import print_function

from datetime import datetime

from airflow import models
from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator

# Arguments applied by default to every task of the DAG declared below.
default_dag_args = dict(
    start_date=datetime(2021, 3, 18),
    owner='File transfer GCS to GCS',
)

# No schedule interval is set, so the DAG only runs when triggered.
with models.DAG(
    'file_transfer_gcs_to_gcs',
    default_args=default_dag_args,
    schedule_interval=None,
) as dag:

    # Copy dags/airflow_monitoring.py to copied_file/ in the destination bucket.
    copy_single_file = GCSToGCSOperator(
        task_id='copy_single_file',
        source_bucket='southamerica-east1-poc-airf-904b2db6-bucket',
        source_objects=['dags/airflow_monitoring.py'],
        destination_bucket='trigger-bucket-poc',
        destination_object='copied_file/airflow_monitoring.py',
    )
)
# [END howto_operator_gcs_object_create_acl_entry_task]

# Download the object at BUCKET_FILE_LOCATION in BUCKET_1 to the local path
# PATH_TO_SAVED_FILE.
# [START howto_operator_gcs_download_file_task]
download_file = GCSToLocalFilesystemOperator(
    task_id="download_file",
    object_name=BUCKET_FILE_LOCATION,
    bucket=BUCKET_1,
    filename=PATH_TO_SAVED_FILE,
)
# [END howto_operator_gcs_download_file_task]

# Copy the same object from BUCKET_1 to BUCKET_2, keeping its object name.
copy_file = GCSToGCSOperator(
    task_id="copy_file",
    source_bucket=BUCKET_1,
    source_object=BUCKET_FILE_LOCATION,
    destination_bucket=BUCKET_2,
    destination_object=BUCKET_FILE_LOCATION,
)

# Delete the object from BUCKET_1.
delete_files = GCSDeleteObjectsOperator(
    task_id="delete_files", bucket_name=BUCKET_1, objects=[BUCKET_FILE_LOCATION]
)

# Delete both buckets used by this example.
# [START howto_operator_gcs_delete_bucket]
delete_bucket_1 = GCSDeleteBucketOperator(task_id="delete_bucket_1", bucket_name=BUCKET_1)
delete_bucket_2 = GCSDeleteBucketOperator(task_id="delete_bucket_2", bucket_name=BUCKET_2)
# [END howto_operator_gcs_delete_bucket]

# Task ordering: both bucket-creation tasks run before the bucket listing
# chain and before the upload.
# NOTE(review): the tasks defined above are not wired into any chain in the
# visible lines; the ordering presumably continues below -- confirm.
[create_bucket1, create_bucket2] >> list_buckets >> list_buckets_result
[create_bucket1, create_bucket2] >> upload_file
# [START howto_sync_from_subdir] sync_from_subdirectory = GCSSynchronizeBucketsOperator( task_id="sync_from_subdirectory", source_bucket=BUCKET_1_SRC, source_object="subdir/", destination_bucket=BUCKET_1_DST, ) # [END howto_sync_from_subdir] # [START howto_operator_gcs_to_gcs_single_file] copy_single_file = GCSToGCSOperator( task_id="copy_single_gcs_file", source_bucket=BUCKET_1_SRC, source_object=OBJECT_1, destination_bucket= BUCKET_1_DST, # If not supplied the source_bucket value will be used destination_object="backup_" + OBJECT_1, # If not supplied the source_object value will be used ) # [END howto_operator_gcs_to_gcs_single_file] # [START howto_operator_gcs_to_gcs_wildcard] copy_files_with_wildcard = GCSToGCSOperator( task_id="copy_files_with_wildcard", source_bucket=BUCKET_1_SRC, source_object="data/*.txt", destination_bucket=BUCKET_1_DST, destination_object="backup/", ) # [END howto_operator_gcs_to_gcs_wildcard]