def test_s3_delete_prefix(self):
    bucket = "testbucket"
    key_pattern = "path/data"
    n_keys = 3
    keys = [key_pattern + str(i) for i in range(n_keys)]

    conn = boto3.client('s3')
    conn.create_bucket(Bucket=bucket)
    for k in keys:
        conn.upload_fileobj(Bucket=bucket, Key=k, Fileobj=io.BytesIO(b"input"))

    # The objects should be detected before the DELETE action is taken
    objects_in_dest_bucket = conn.list_objects(Bucket=bucket, Prefix=key_pattern)
    assert len(objects_in_dest_bucket['Contents']) == n_keys
    assert sorted(x['Key'] for x in objects_in_dest_bucket['Contents']) == sorted(keys)

    op = S3DeleteObjectsOperator(task_id="test_task_s3_delete_prefix", bucket=bucket, prefix=key_pattern)
    op.execute(None)

    # There should be no object found in the bucket created earlier
    assert 'Contents' not in conn.list_objects(Bucket=bucket, Prefix=key_pattern)
def test_s3_delete_single_object(self):
    bucket = "testbucket"
    key = "path/data.txt"

    conn = boto3.client('s3')
    conn.create_bucket(Bucket=bucket)
    conn.upload_fileobj(Bucket=bucket, Key=key, Fileobj=io.BytesIO(b"input"))

    # The object should be detected before the DELETE action is taken
    objects_in_dest_bucket = conn.list_objects(Bucket=bucket, Prefix=key)
    self.assertEqual(len(objects_in_dest_bucket['Contents']), 1)
    self.assertEqual(objects_in_dest_bucket['Contents'][0]['Key'], key)

    op = S3DeleteObjectsOperator(task_id="test_task_s3_delete_single_object", bucket=bucket, keys=key)
    op.execute(None)

    # There should be no object found in the bucket created earlier
    self.assertFalse('Contents' in conn.list_objects(Bucket=bucket, Prefix=key))
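# The two test methods above exercise S3DeleteObjectsOperator against a mocked S3
# backend. A minimal sketch of the scaffolding they assume, using moto's mock_s3
# decorator (moto < 5; newer moto releases expose mock_aws instead); the class name
# and import path are assumptions, and the operator module moved to
# airflow.providers.amazon.aws.operators.s3 in newer provider releases:
import io
import unittest

import boto3
from moto import mock_s3

from airflow.providers.amazon.aws.operators.s3_delete_objects import S3DeleteObjectsOperator


@mock_s3
class TestS3DeleteObjectsOperator(unittest.TestCase):
    # the two test methods above would be defined here
    ...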
upload_salesforce_data_to_s3_landing = SalesforceToS3Operator(
    # call header reconstructed from the references below; the task_id is an assumption
    task_id="upload_salesforce_data_to_s3_landing",
    salesforce_query="SELECT Id, Name, Company, Phone, Email, LastModifiedDate, IsActive FROM Customers",
    s3_bucket_name="landing-bucket",
    s3_key=f"{BASE_PATH}/{FILE_NAME}",
    salesforce_conn_id="salesforce",
    replace=True,
)
# [END howto_operator_salesforce_to_s3_transfer]

date_prefixes = "{{ execution_date.strftime('%Y/%m/%d') }}"

store_to_s3_data_lake = S3CopyObjectOperator(
    task_id="store_to_s3_data_lake",
    source_bucket_key=upload_salesforce_data_to_s3_landing.output["s3_uri"],
    dest_bucket_name="data_lake",
    dest_bucket_key=f"{BASE_PATH}/{date_prefixes}/{FILE_NAME}",
)

delete_data_from_s3_landing = S3DeleteObjectsOperator(
    task_id="delete_data_from_s3_landing",
    bucket=upload_salesforce_data_to_s3_landing.output["s3_bucket_name"],
    keys=upload_salesforce_data_to_s3_landing.output["s3_key"],
)

store_to_s3_data_lake >> delete_data_from_s3_landing

# Task dependencies created via `XComArgs`:
#   upload_salesforce_data_to_s3_landing >> store_to_s3_data_lake
#   upload_salesforce_data_to_s3_landing >> delete_data_from_s3_landing
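# A minimal sketch of the imports, constants, and DAG context the example above
# assumes; the module paths follow the Amazon provider package, and BASE_PATH,
# FILE_NAME, and the DAG arguments are assumptions rather than part of the
# original snippet:
from airflow import DAG
from airflow.providers.amazon.aws.operators.s3_copy_object import S3CopyObjectOperator
from airflow.providers.amazon.aws.operators.s3_delete_objects import S3DeleteObjectsOperator
from airflow.providers.amazon.aws.transfers.salesforce_to_s3 import SalesforceToS3Operator
from airflow.utils.dates import days_ago

BASE_PATH = "salesforce/customers"
FILE_NAME = "customer_daily_extract.csv"

with DAG(
    "example_salesforce_to_s3",
    schedule_interval=None,
    start_date=days_ago(1),
) as dag:
    # the three operators above are defined inside this DAG context
    ...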
default_args = {
    # preceding default_args entries are not shown in the original snippet
    'retry_delay': timedelta(minutes=10)
}

with DAG('minio-fifa-spark-operator',
         default_args=default_args,
         schedule_interval='@daily',
         tags=['development', 's3', 'minio', 'spark-operator']) as dag:

    etl_fifa_spark_operator = SparkKubernetesOperator(
        task_id='etl_fifa_spark_operator',
        namespace='processing',
        application_file='etl-fifa.yaml',
        kubernetes_conn_id='minikube',
        do_xcom_push=True)

    monitor_spark_app_status = SparkKubernetesSensor(
        task_id='monitor_spark_app_status',
        namespace="processing",
        application_name="{{ task_instance.xcom_pull(task_ids='etl_fifa_spark_operator')['metadata']['name'] }}",
        kubernetes_conn_id="minikube")

    delete_s3_file_raw_zone = S3DeleteObjectsOperator(
        task_id='delete_s3_file_raw_zone',
        bucket=RAW_ZONE,
        keys='data.csv',
        aws_conn_id='minio',
        do_xcom_push=True)

    etl_fifa_spark_operator >> monitor_spark_app_status >> delete_s3_file_raw_zone
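# A minimal sketch of the imports and the RAW_ZONE constant the DAG above assumes;
# the module paths and the bucket name are assumptions and may differ between
# provider package versions:
from datetime import timedelta

from airflow import DAG
from airflow.providers.amazon.aws.operators.s3_delete_objects import S3DeleteObjectsOperator
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor

RAW_ZONE = "raw-zone"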