def test_s3_copy_object_arg_combination_1(self):
        conn = boto3.client('s3')
        conn.create_bucket(Bucket=self.source_bucket)
        conn.create_bucket(Bucket=self.dest_bucket)
        conn.upload_fileobj(Bucket=self.source_bucket,
                            Key=self.source_key,
                            Fileobj=io.BytesIO(b"input"))

        # there should be nothing found before S3CopyObjectOperator is executed
        self.assertFalse('Contents' in conn.list_objects(
            Bucket=self.dest_bucket, Prefix=self.dest_key))

        op = S3CopyObjectOperator(
            task_id="test_task_s3_copy_object",
            source_bucket_key=self.source_key,
            source_bucket_name=self.source_bucket,
            dest_bucket_key=self.dest_key,
            dest_bucket_name=self.dest_bucket,
        )
        op.execute(None)

        objects_in_dest_bucket = conn.list_objects(Bucket=self.dest_bucket,
                                                   Prefix=self.dest_key)
        # exactly one object should be found
        self.assertEqual(len(objects_in_dest_bucket['Contents']), 1)
        # the object found should be consistent with dest_key specified earlier
        self.assertEqual(objects_in_dest_bucket['Contents'][0]['Key'],
                         self.dest_key)

    def test_s3_copy_object_arg_combination_2(self):
        conn = boto3.client('s3')
        conn.create_bucket(Bucket=self.source_bucket)
        conn.create_bucket(Bucket=self.dest_bucket)
        conn.upload_fileobj(Bucket=self.source_bucket,
                            Key=self.source_key,
                            Fileobj=io.BytesIO(b"input"))

        # there should be nothing found before S3CopyObjectOperator is executed
        assert 'Contents' not in conn.list_objects(Bucket=self.dest_bucket,
                                                   Prefix=self.dest_key)

        source_key_s3_url = f"s3://{self.source_bucket}/{self.source_key}"
        dest_key_s3_url = f"s3://{self.dest_bucket}/{self.dest_key}"
        op = S3CopyObjectOperator(
            task_id="test_task_s3_copy_object",
            source_bucket_key=source_key_s3_url,
            dest_bucket_key=dest_key_s3_url,
        )
        op.execute(None)

        objects_in_dest_bucket = conn.list_objects(Bucket=self.dest_bucket,
                                                   Prefix=self.dest_key)
        # exactly one object should be found
        assert len(objects_in_dest_bucket['Contents']) == 1
        # the object found should be consistent with dest_key specified earlier
        assert objects_in_dest_bucket['Contents'][0]['Key'] == self.dest_key
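
# A minimal usage sketch (not part of the test file above; the DAG id, bucket
# names, and keys below are hypothetical). The two tests exercise the two
# supported argument styles (explicit bucket/key pairs vs. full "s3://bucket/key"
# URLs); this is how the same two styles look inside a DAG.
from airflow import DAG
from airflow.providers.amazon.aws.operators.s3_copy_object import S3CopyObjectOperator
from airflow.utils.dates import days_ago

with DAG(dag_id="s3_copy_example", schedule_interval=None, start_date=days_ago(1)) as dag:
    copy_with_names = S3CopyObjectOperator(
        task_id="copy_with_names",
        source_bucket_name="source-bucket",
        source_bucket_key="data/input.csv",
        dest_bucket_name="dest-bucket",
        dest_bucket_key="data/input_copy.csv",
    )
    copy_with_urls = S3CopyObjectOperator(
        task_id="copy_with_urls",
        source_bucket_key="s3://source-bucket/data/input.csv",
        dest_bucket_key="s3://dest-bucket/data/input_copy.csv",
    )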
Example #3
        task_id="upload_salesforce_data_to_s3",
        salesforce_query=
        "SELECT Id, Name, Company, Phone, Email, LastModifiedDate, IsActive FROM Customers",
        s3_bucket_name="landing-bucket",
        s3_key=f"{BASE_PATH}/{FILE_NAME}",
        salesforce_conn_id="salesforce",
        replace=True,
    )
    # [END howto_operator_salesforce_to_s3_transfer]

    date_prefixes = "{{ execution_date.strftime('%Y/%m/%d') }}"

    store_to_s3_data_lake = S3CopyObjectOperator(
        task_id="store_to_s3_data_lake",
        source_bucket_key=upload_salesforce_data_to_s3_landing.output["s3_uri"],
        dest_bucket_name="data_lake",
        dest_bucket_key=f"{BASE_PATH}/{date_prefixes}/{FILE_NAME}",
    )

    delete_data_from_s3_landing = S3DeleteObjectsOperator(
        task_id="delete_data_from_s3_landing",
        bucket=upload_salesforce_data_to_s3_landing.output["s3_bucket_name"],
        keys=upload_salesforce_data_to_s3_landing.output["s3_key"],
    )

    store_to_s3_data_lake >> delete_data_from_s3_landing

    # Task dependencies created via `XComArgs`:
    #   upload_salesforce_data_to_s3_landing >> store_to_s3_data_lake
    #   upload_salesforce_data_to_s3_landing >> delete_data_from_s3_landing
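    #
    # Note: referencing `.output["..."]` on upload_salesforce_data_to_s3_landing hands
    # XComArg objects to the two downstream operators, which is why those upstream
    # dependencies are registered automatically when the DAG is parsed and no explicit
    # `>>` from the upload task is needed.
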
import io

import airflow.utils.dates
from airflow import DAG
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.providers.amazon.aws.operators.s3_copy_object import S3CopyObjectOperator
from airflow.providers.amazon.aws.operators.sagemaker_endpoint import SageMakerEndpointOperator
from airflow.providers.amazon.aws.operators.sagemaker_training import SageMakerTrainingOperator
from sagemaker.amazon.common import write_numpy_to_dense_tensor

dag = DAG(
    dag_id="chapter9_aws_handwritten_digit_classifier",
    schedule_interval=None,
    start_date=airflow.utils.dates.days_ago(3),
)

# Copy the public MNIST sample dataset into our own bucket so that the
# downstream preprocessing and training tasks can access it.
download_mnist_data = S3CopyObjectOperator(
    task_id="download_mnist_data",
    source_bucket_name="sagemaker-sample-data-eu-west-1",
    source_bucket_key="algorithms/kmeans/mnist/mnist.pkl.gz",
    dest_bucket_name="your-bucket",
    dest_bucket_key="mnist.pkl.gz",
    dag=dag,
)


def _extract_mnist_data():
    s3hook = S3Hook()

    # Download S3 dataset into memory
    mnist_buffer = io.BytesIO()
    mnist_obj = s3hook.get_key(bucket_name="your-bucket", key="mnist.pkl.gz")
    mnist_obj.download_fileobj(mnist_buffer)

    # Unpack gzip file, extract dataset, convert to dense tensor, upload back to S3
    mnist_buffer.seek(0)