def test_execute(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_hook):
    """Test the execute function when the run is successful."""
    operator = S3ToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        bucket=S3_BUCKET,
        prefix=S3_PREFIX,
        delimiter=S3_DELIMITER,
        dest_gcs_conn_id=GCS_CONN_ID,
        dest_gcs=GCS_PATH_PREFIX)

    s3_one_mock_hook.return_value.list_keys.return_value = MOCK_FILES
    s3_two_mock_hook.return_value.list_keys.return_value = MOCK_FILES

    uploaded_files = operator.execute(None)

    gcs_mock_hook.return_value.upload.assert_has_calls(
        [
            mock.call('gcs-bucket', 'data/TEST1.csv', mock.ANY, gzip=False),
            mock.call('gcs-bucket', 'data/TEST3.csv', mock.ANY, gzip=False),
            mock.call('gcs-bucket', 'data/TEST2.csv', mock.ANY, gzip=False)
        ],
        any_order=True
    )

    s3_one_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID, verify=None)
    s3_two_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID, verify=None)
    gcs_mock_hook.assert_called_once_with(
        google_cloud_storage_conn_id=GCS_CONN_ID, delegate_to=None)

    # we expect MOCK_FILES to be uploaded
    self.assertEqual(sorted(MOCK_FILES), sorted(uploaded_files))
def test_execute(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_hook):
    """Test the execute function when the run is successful."""
    operator = S3ToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        bucket=S3_BUCKET,
        prefix=S3_PREFIX,
        delimiter=S3_DELIMITER,
        dest_gcs_conn_id=GCS_CONN_ID,
        dest_gcs=GCS_PATH_PREFIX)

    s3_one_mock_hook.return_value.list_keys.return_value = MOCK_FILES
    s3_two_mock_hook.return_value.list_keys.return_value = MOCK_FILES

    def _assert_upload(bucket, object, tmp_filename):
        gcs_bucket, gcs_object_path = _parse_gcs_url(GCS_PATH_PREFIX)
        self.assertEqual(gcs_bucket, bucket)
        self.assertIn(object[len(gcs_object_path):], MOCK_FILES)

    gcs_mock_hook.return_value.upload.side_effect = _assert_upload

    uploaded_files = operator.execute(None)

    s3_one_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID, verify=None)
    s3_two_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID, verify=None)
    gcs_mock_hook.assert_called_once_with(
        google_cloud_storage_conn_id=GCS_CONN_ID, delegate_to=None)

    # we expect MOCK_FILES to be uploaded
    self.assertEqual(sorted(MOCK_FILES), sorted(uploaded_files))
def test_init(self):
    """Test S3ToGoogleCloudStorageOperator instance is properly initialized."""
    operator = S3ToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        bucket=S3_BUCKET,
        prefix=S3_PREFIX,
        delimiter=S3_DELIMITER,
        gcp_conn_id=GCS_CONN_ID,
        dest_gcs=GCS_PATH_PREFIX)

    self.assertEqual(operator.task_id, TASK_ID)
    self.assertEqual(operator.bucket, S3_BUCKET)
    self.assertEqual(operator.prefix, S3_PREFIX)
    self.assertEqual(operator.delimiter, S3_DELIMITER)
    self.assertEqual(operator.gcp_conn_id, GCS_CONN_ID)
    self.assertEqual(operator.dest_gcs, GCS_PATH_PREFIX)
def test_execute_with_gzip(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_hook):
    """Test the execute function when the run is successful."""
    operator = S3ToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        bucket=S3_BUCKET,
        prefix=S3_PREFIX,
        delimiter=S3_DELIMITER,
        dest_gcs_conn_id=GCS_CONN_ID,
        dest_gcs=GCS_PATH_PREFIX,
        gzip=True)

    s3_one_mock_hook.return_value.list_keys.return_value = MOCK_FILES
    s3_two_mock_hook.return_value.list_keys.return_value = MOCK_FILES

    operator.execute(None)

    gcs_mock_hook.return_value.upload.assert_has_calls(
        [
            mock.call('gcs-bucket', 'data/TEST2.csv', mock.ANY, gzip=True),
            mock.call('gcs-bucket', 'data/TEST1.csv', mock.ANY, gzip=True),
            mock.call('gcs-bucket', 'data/TEST3.csv', mock.ANY, gzip=True)
        ],
        any_order=True
    )
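These test methods rely on module-level constants and mock.patch decorators that are not shown above. Below is a minimal sketch of that scaffolding: the constant values are inferred from the assertions in the tests (bucket gcs-bucket, object prefix data/, files TEST1.csv through TEST3.csv), while the import paths and patch targets are assumptions that depend on the Airflow version in use.

# Sketch only: constant values inferred from the assertions above; import
# paths and mock.patch targets are assumptions tied to the Airflow version.
import unittest
from unittest import mock

from airflow.contrib.hooks.gcs_hook import _parse_gcs_url
from airflow.contrib.operators.s3_to_gcs_operator import S3ToGoogleCloudStorageOperator

TASK_ID = 'test-s3-gcs-operator'
S3_BUCKET = 'test-bucket'
S3_PREFIX = 'TEST'
S3_DELIMITER = '/'
GCS_CONN_ID = 'google_cloud_default'
AWS_CONN_ID = 'aws_default'
GCS_PATH_PREFIX = 'gs://gcs-bucket/data/'
MOCK_FILES = ['TEST1.csv', 'TEST2.csv', 'TEST3.csv']


class TestS3ToGoogleCloudStorageOperator(unittest.TestCase):

    # Decorators apply bottom-up, so the lowest patch is injected as the first
    # mock argument (gcs_mock_hook), followed by the two S3Hook patches.
    @mock.patch('airflow.contrib.operators.s3_to_gcs_operator.S3Hook')
    @mock.patch('airflow.contrib.operators.s3_list_operator.S3Hook')
    @mock.patch('airflow.contrib.operators.s3_to_gcs_operator.GoogleCloudStorageHook')
    def test_execute(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_hook):
        ...  # body as shown above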
start = DummyOperator(
    task_id='start',
    trigger_rule='all_success'
)

end = DummyOperator(
    task_id='end',
    trigger_rule='one_success'
)

# Task to copy files from S3 to GCS
s3_email_to_gcs = S3ToGoogleCloudStorageOperator(
    task_id='s3_to_gcs',
    bucket=S3_BUCKET,
    aws_conn_id=AWS_CONNECTION,
    dest_gcs_conn_id=GCS_BUCKET_CONNECTION,
    dest_gcs=GCS_BUCKET_NAME_S3_TASK
)
s3_email_to_gcs.set_upstream(start)

# Task to load files from GCS into BQ
gcs_to_bq_task = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket=GCS_BUCKET_NAME_BQ_TASK,
    source_objects=['*.parquet'],  # assumes parquet files; other file types are supported
    destination_project_dataset_table=DESTINATION_TABLE,
    source_format='parquet',
    skip_leading_rows=1,  # only applies to CSV loads; BigQuery ignores it for parquet
    max_bad_records=10,
    bigquery_conn_id=BQ_CONNECTION,
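    # --- The original example ends abruptly above. Everything from here on is
    # --- an illustrative completion, not part of the source snippet: the
    # --- write_disposition value and the task ordering are assumptions.
    write_disposition='WRITE_TRUNCATE',
)

# Run the BigQuery load only after the S3-to-GCS copy finishes, then close out the DAG
gcs_to_bq_task.set_upstream(s3_email_to_gcs)
end.set_upstream(gcs_to_bq_task)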