Example #1
import traceback

import boto3
from s3transfer.manager import TransferManager


class S3Bucket:
    def __init__(self, bucket, endpoint, id, key, region):
        self.bucket = bucket
        self.service_endpoint = endpoint
        self.aws_access_key_id = id
        self.aws_secret_access_key = key
        self.region_name = region
        self.client = boto3.client(
            's3',
            endpoint_url=self.service_endpoint,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name)
        self.transfer = TransferManager(self.client, None, None, None)

    def manager_upload(self, file):
        self.transfer.upload(file, self.bucket, file[1:], None, None)

    def upload(self, file, filekey):
        # list_objects always returns a response dict, so check its 'Contents'
        # field to see whether an object with this key already exists
        exist = self.client.list_objects(Bucket=self.bucket,
                                         Prefix=filekey[1:])
        if exist.get('Contents'):
            return False
        try:
            with open(file, 'rb') as f:
                self.client.upload_fileobj(f, self.bucket, filekey[1:])
        except Exception as ex:
            traceback.print_exc()
            raise
        return True
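
A minimal usage sketch (the bucket name, endpoint, credentials, and file path are placeholders):

s3 = S3Bucket('my-bucket', 'https://s3.us-east-1.amazonaws.com',
              'ACCESS_KEY_ID', 'SECRET_ACCESS_KEY', 'us-east-1')
# keys are passed with a leading slash, which the class strips off
if s3.upload('/tmp/report.csv', '/reports/report.csv'):
    print('uploaded')
else:
    print('an object with this key already exists')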
Example #2
class S3Uploader(object):
    """
    Class to upload objects to S3 bucket that use versioning. If bucket
    does not already use versioning, this class will turn on versioning.
    """
    def __init__(self,
                 s3_client,
                 bucket_name,
                 region,
                 prefix=None,
                 kms_key_id=None,
                 force_upload=False,
                 transfer_manager=None):
        self.bucket_name = bucket_name
        self.prefix = prefix
        self.kms_key_id = kms_key_id or None
        self.force_upload = force_upload
        self.s3 = s3_client
        self.region = region

        self.transfer_manager = transfer_manager
        if not transfer_manager:
            self.transfer_manager = TransferManager(self.s3)

    def upload(self, file_name, remote_path):
        """
        Uploads given file to S3
        :param file_name: Path to the file that will be uploaded
        :param remote_path:  be uploaded
        :return: VersionId of the latest upload
        """

        if self.prefix and len(self.prefix) > 0:
            remote_path = "{0}/{1}".format(self.prefix, remote_path)

        # Check if a file with same data exists
        if not self.force_upload and self.file_exists(remote_path):
            LOG.debug("File with same data already exists at {0}. "
                      "Skipping upload".format(remote_path))
            return self.make_url(remote_path)

        try:

            # Default to regular server-side encryption unless customer has
            # specified their own KMS keys
            additional_args = {"ServerSideEncryption": "AES256"}

            if self.kms_key_id:
                additional_args["ServerSideEncryption"] = "aws:kms"
                additional_args["SSEKMSKeyId"] = self.kms_key_id

            print_progress_callback = \
                ProgressPercentage(file_name, remote_path)
            future = self.transfer_manager.upload(file_name, self.bucket_name,
                                                  remote_path, additional_args,
                                                  [print_progress_callback])
            future.result()

            return self.make_url(remote_path)

        except botocore.exceptions.ClientError as ex:
            error_code = ex.response["Error"]["Code"]
            if error_code == "NoSuchBucket":
                raise exceptions.NoSuchBucketError(
                    bucket_name=self.bucket_name)
            raise ex

    def upload_with_dedup(self, file_name, extension=None):
        """
        Uploads the file under an S3 object name derived from the file's MD5 sum

        :param file_name: file to upload
        :param extension: String of file extension to append to the object
        :return: S3 URL of the uploaded object
        """

        # This construction of remote_path is critical to preventing duplicate
        # uploads of the same object. The uploader checks whether the file already
        # exists in S3 and re-uploads only if necessary, so if the template points
        # to the same file in multiple places, it is uploaded only once.

        filemd5 = self.file_checksum(file_name)
        remote_path = filemd5
        if extension:
            remote_path = remote_path + "." + extension

        return self.upload(file_name, remote_path)

    def file_exists(self, remote_path):
        """
        Check if the file we are trying to upload already exists in S3

        :param remote_path:
        :return: True, if file exists. False, otherwise
        """

        try:
            # Find the object that matches this ETag
            self.s3.head_object(Bucket=self.bucket_name, Key=remote_path)
            return True
        except botocore.exceptions.ClientError:
            # Either File does not exist or we are unable to get
            # this information.
            return False

    def make_url(self, obj_path):
        return "s3://{0}/{1}".format(self.bucket_name, obj_path)

    def file_checksum(self, file_name):

        with open(file_name, "rb") as file_handle:
            md5 = hashlib.md5()
            # Read file in chunks of 4096 bytes
            block_size = 4096

            # Save current cursor position and reset cursor to start of file
            curpos = file_handle.tell()
            file_handle.seek(0)

            buf = file_handle.read(block_size)
            while len(buf) > 0:
                md5.update(buf)
                buf = file_handle.read(block_size)

            # Restore file cursor's position
            file_handle.seek(curpos)

            return md5.hexdigest()

    def to_path_style_s3_url(self, key, version=None):
        """
            This link describes the format of Path Style URLs
            http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingBucket.html#access-bucket-intro
        """
        base = "https://s3.amazonaws.com"
        if self.region and self.region != "us-east-1":
            base = "https://s3-{0}.amazonaws.com".format(self.region)

        result = "{0}/{1}/{2}".format(base, self.bucket_name, key)
        if version:
            result = "{0}?versionId={1}".format(result, version)

        return result
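
A minimal usage sketch, assuming the surrounding module provides the helpers the class relies on (LOG, ProgressPercentage, exceptions); the bucket name, region, prefix, and file path are placeholders:

import boto3

s3_client = boto3.client('s3')
uploader = S3Uploader(s3_client, 'my-artifact-bucket', 'us-west-2', prefix='builds')
# Uploads to s3://my-artifact-bucket/builds/<md5-of-file>.yaml and skips the
# transfer if an object with that checksum-derived key already exists.
url = uploader.upload_with_dedup('/tmp/template.yaml', extension='yaml')
print(url)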
Example #3
class AtomicRemoteWritableS3File(object):
    """
    An S3 file that writes to a remote temp object on S3; copies to the true key on close.

    This class requires boto3 v1.4.0+ for its non-seekable file object upload ability.

    Useful for performing operations on large S3 objects when you don't have
    sufficient space on local drives.

    Works around AWS S3's multipart transfer size requirements and boto3's
    idiosyncratic implementation that requires an initial buffer size
    larger than the multipart transfer threshold in order to correctly
    select the 'read-until-empty' behavior needed for a streaming upload.
    """
    _boto3_default_multipart_threshold = 8 * 1024 * 1024

    def __init__(self, s3_bucket, s3_key, boto3_s3_client=None):
        import boto3
        from s3transfer.manager import TransferManager, TransferConfig

        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.s3_client = boto3_s3_client
        if self.s3_client is None:
            self.s3_client = boto3.client('s3')

        self._internal_queue = BlockingReaderWriterByteStream()
        self._boto3_multipart_upload_workaround_buffer = b''

        self.temp_s3_key = self.s3_key + '-{:0>10}-tmp'.format(
            random.randrange(0, 1e10))

        # don't start the upload until we've written at least
        # boto3.TransferConfig.multipart_threshold bytes
        self._transfer_manager = TransferManager(self.s3_client,
                                                 TransferConfig())
        self._upload_future = None

    def write(self, some_bytes):
        """
        Writes bytes to S3.

        This method may not be safely called by multiple writers in different threads.
        """
        self._write(some_bytes)

    def _write(self, some_bytes, close_and_flush=False):
        """
        Buffers writes until they're large enough to be safely sent to boto3.
        """
        buffer_write = (len(self._boto3_multipart_upload_workaround_buffer) +
                        len(some_bytes) <
                        self._boto3_default_multipart_threshold)
        self._boto3_multipart_upload_workaround_buffer += some_bytes
        if not buffer_write or close_and_flush:
            self._internal_queue.write(
                self._boto3_multipart_upload_workaround_buffer)
            self._boto3_multipart_upload_workaround_buffer = b''
            if not self._upload_future:
                self._submit_upload()

    def _submit_upload(self):
        self._upload_future = self._transfer_manager.upload(
            fileobj=self._internal_queue,
            bucket=self.s3_bucket,
            key=self.temp_s3_key)

    def close(self):
        """
        Closes the writer, so that it will flush to the reader.

        This method will block until the file has been fully flushed to S3,
        and until it has been properly moved to its final destination.
        """
        self._write(b'', close_and_flush=True)
        self._internal_queue.close()
        self._upload_future.result(
        )  # wait for upload to complete before moving
        self._move_to_final_destination()

    def _move_to_final_destination(self):
        self.s3_client.copy_object(Bucket=self.s3_bucket,
                                   Key=self.s3_key,
                                   CopySource={
                                       'Bucket': self.s3_bucket,
                                       'Key': self.temp_s3_key
                                   })
        self.s3_client.delete_object(Bucket=self.s3_bucket,
                                     Key=self.temp_s3_key)

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type:
            self._internal_queue.error('Pipe not properly closed.')
            if self._upload_future:
                self._upload_future.result()
                self.s3_client.delete_object(Bucket=self.s3_bucket,
                                             Key=self.temp_s3_key)
            return
        else:
            self.close()
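
A minimal usage sketch (bucket, key, and the chunk producer are placeholders; the class itself still needs BlockingReaderWriterByteStream from the surrounding module):

def generate_chunks():
    # hypothetical producer of byte chunks, e.g. rows of a large export
    for _ in range(2000):
        yield b'some data\n' * 1024

with AtomicRemoteWritableS3File('my-bucket', 'exports/large-file.txt') as remote_file:
    for chunk in generate_chunks():
        remote_file.write(chunk)
# On a clean exit the temporary object is copied to 'exports/large-file.txt'
# and the temp key is deleted.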
Example #4
class S3Transfer(object):
    ALLOWED_DOWNLOAD_ARGS = TransferManager.ALLOWED_DOWNLOAD_ARGS
    ALLOWED_UPLOAD_ARGS = TransferManager.ALLOWED_UPLOAD_ARGS

    def __init__(self, client=None, config=None, osutil=None, manager=None):
        if not client and not manager:
            raise ValueError(
                'Either a boto3.Client or s3transfer.manager.TransferManager '
                'must be provided'
            )
        if manager and any([client, config, osutil]):
            raise ValueError(
                'Manager cannot be provided with client, config, '
                'nor osutil. These parameters are mutually exclusive.'
            )
        if config is None:
            config = TransferConfig()
        if osutil is None:
            osutil = OSUtils()
        if manager:
            self._manager = manager
        else:
            self._manager = TransferManager(client, config, osutil)

    def upload_file(self, filename, bucket, key,
                    callback=None, extra_args=None):
        """Upload a file to an S3 object.

        Variants have also been injected into S3 client, Bucket and Object.
        You don't have to use S3Transfer.upload_file() directly.
        """
        if not isinstance(filename, six.string_types):
            raise ValueError('Filename must be a string')

        subscribers = self._get_subscribers(callback)
        future = self._manager.upload(
            filename, bucket, key, extra_args, subscribers)
        try:
            future.result()
        # If a client error was raised, add the backwards compatibility layer
        # that raises a S3UploadFailedError. These specific errors were only
        # ever thrown for upload_parts but now can be thrown for any related
        # client error.
        except ClientError as e:
            raise S3UploadFailedError(
                "Failed to upload %s to %s: %s" % (
                    filename, '/'.join([bucket, key]), e))

    def download_file(self, bucket, key, filename, extra_args=None,
                      callback=None):
        """Download an S3 object to a file.

        Variants have also been injected into S3 client, Bucket and Object.
        You don't have to use S3Transfer.download_file() directly.
        """
        if not isinstance(filename, six.string_types):
            raise ValueError('Filename must be a string')

        subscribers = self._get_subscribers(callback)
        future = self._manager.download(
            bucket, key, filename, extra_args, subscribers)
        try:
            future.result()
        # This is for backwards compatibility where when retries are
        # exceeded we need to throw the same error from boto3 instead of
        # s3transfer's built in RetriesExceededError as current users are
        # catching the boto3 one instead of the s3transfer exception to do
        # their own retries.
        except S3TransferRetriesExceededError as e:
            raise RetriesExceededError(e.last_exception)

    def _get_subscribers(self, callback):
        if not callback:
            return None
        return [ProgressCallbackInvoker(callback)]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self._manager.__exit__(*args)
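
A minimal usage sketch (bucket name, keys, and local paths are placeholders):

import boto3

client = boto3.client('s3')
with S3Transfer(client) as transfer:
    transfer.upload_file('/tmp/backup.tar.gz', 'my-bucket', 'backups/backup.tar.gz')
    transfer.download_file('my-bucket', 'backups/backup.tar.gz',
                           '/tmp/restored.tar.gz',
                           callback=lambda bytes_transferred: None)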
Example #5
def upload_file(**kwargs):
    """Upload manager."""
    filename = kwargs.pop('file')
    parent_id = kwargs.pop('parent_id', None)
    # st_size: size of the file in bytes; also used as the default chunk size
    multipart_chunksize = file_size = os.stat(filename).st_size
    client = kwargs['client']

    if file_size > MULTIPART_THRESHOLD:  # bigger than 1GB
        multipart_chunksize = min(int(file_size / 10), int(MAX_CHUNK_SIZE))
        multipart_chunksize = max(multipart_chunksize, int(MIN_CHUNK_SIZE))
        LOGGER.info('File size: %s MB', file_size / MB)
        LOGGER.info('Chunk size: %s MB', int(multipart_chunksize / MB))
    config = TransferConfig(multipart_threshold=MULTIPART_THRESHOLD,
                            max_concurrency=MAX_CONCURRENCY,
                            multipart_chunksize=multipart_chunksize)
    osutil = OSUtilsWithCallbacks()
    # Check if given parent folder exists
    if parent_id:
        fl_obj = Files(base_url=kwargs['base_url'], api_key=kwargs['api_key'])
        res = fl_obj.get_list(parent_id=parent_id)
        if not res['status']:
            raise NotFoundException('Parent folder for upload does '
                                    'not exist.')

    transfer_manager = TransferManager(client, config=config, osutil=osutil)

    subscribers = [
        ProgressSubscriber(filename),
    ]

    _, file_name = os.path.split(filename)
    try:
        init_url = client.base_url + urls.UPLOAD_INIT_URL
        response = requests_retry_session().put(init_url,
                                                json=dict(file_name=file_name),
                                                headers=client.header)
        if response.status_code == 402:
            raise NotEnoughCredits('Insufficient credits for upload.')
        if response.status_code == 403:
            raise AuthenticationFailed('Authentication Failed. Wrong API Key.')
        if response.status_code == requests.codes.ok:
            sources = response.json()
            future = transfer_manager.upload(filename,
                                             sources['upload_source'],
                                             sources['upload_key'],
                                             extra_args=None,
                                             subscribers=subscribers)
        else:
            LOGGER.error(
                'File upload initialisation failed. '
                'Response code: %s', response.status_code)
            raise UploadException('File upload initialisation failed. '
                                  'Response code: %s' % response.status_code)
        try:
            future.result()
        except KeyboardInterrupt:
            do_not_retry_event.set()
            return
        return sources['upload_key']

        # If a client error was raised, add the backwards compatibility layer
        # that raises a S3UploadFailedError. These specific errors were only
        # ever thrown for upload_parts but now can be thrown for any related
        # client error.

    except ClientError as error:
        raise S3UploadFailedError("Failed to upload {} to {}: {}".format(
            filename,
            '/'.join([sources['upload_source'],
                      sources['upload_key']]), error))
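
A hypothetical call sketch; api_client stands for the project's client object (it is handed to TransferManager and also exposes base_url and header, as used above), and the file path and API key are placeholders:

upload_key = upload_file(file='/data/sample.fastq.gz',
                         parent_id=None,
                         client=api_client,
                         base_url=api_client.base_url,
                         api_key='MY_API_KEY')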
Example #6
def download_url(url, s3key, s3client=None, s3bucket=None, s3region=None):
    """Downloads the given URL to the given S3 destination object.

:type url: string
:param url: source URL to download

:type s3key: string
:param s3key: key name for the destination S3 object

:type s3client: botocore.client.S3
:param s3client: S3 client to use (optional)

:type s3bucket: string
:param s3bucket: destination S3 bucket name (optional)

:type s3region: string
:param s3region: destination AWS region, e.g. 'us-east-1' (optional)

Will download the content at the given URL to the given S3 destination. Will
consult the usual boto environment variables and configuration to create a
client as needed. If the destination region is not provided, the AWS_REGION
environment variable will be consulted instead."""
    if s3client is not None:
        s3c = s3client
    else:
        s3c = _get_client(s3region)

    if s3bucket is None and 'AWS_S3_BUCKET' not in os.environ:
        msg = "No S3 bucket name configured"
        logging.error(msg)
        raise ConfigurationException(msg)
    if s3bucket is None:
        s3bucket = os.environ['AWS_S3_BUCKET']

    try:
        logging.info("Validating existence of bucket %s..." % s3bucket)
        start = time.time()
        s3c.head_bucket(Bucket=s3bucket)
        end = time.time()
        logging.info("Bucket %s exists (%ld ms)" %
                     (s3bucket, int((end - start) * 1000.0)))
    except ClientError as e:
        logging.info("Creating bucket %s..." % s3bucket)
        start = time.time()
        s3c.create_bucket(Bucket=s3bucket)
        end = time.time()
        logging.info("Created bucket %s (%ld ms)" %
                     (s3bucket, int((end - start) * 1000.0)))

    logging.info("Checking metadata on %s..." % url)
    start = time.time()
    req = urllib.request.Request(url, headers={'Accept-Encoding': 'gzip'})
    u = urllib.request.urlopen(req)
    end = time.time()
    logging.info("Fetched metadata on %s (%ld ms)" %
                 (url, int((end - start) * 1000.0)))

    if _is_up_to_date(u, s3c, s3bucket, s3key):
        u.close()
        logging.info("Skipping download of %s to s3://%s/%s (up-to-date)" %
                     (url, s3bucket, s3key))
        return

    tm = TransferManager(s3c)
    extra_args = {'ACL': 'private'}
    _set_upload_arg(extra_args, u, 'Content-Encoding', 'ContentEncoding')
    _set_upload_arg(extra_args, u, 'Content-Type', 'ContentType')
    _set_metadata(extra_args, u)

    logging.info("Beginning download of %s to s3://%s/%s..." %
                 (url, s3bucket, s3key))
    start = time.time()
    f = tm.upload(u, s3bucket, s3key, extra_args=extra_args)
    f.result()
    end = time.time()
    logging.info("Download of %s to s3://%s/%s complete (%ld ms)" %
                 (url, s3bucket, s3key, int((end - start) * 1000.0)))
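
A minimal usage sketch (URL, key, bucket, and region are placeholders; credentials come from the usual boto environment variables and configuration):

download_url('https://example.com/data/dataset.csv.gz',
             'mirrors/dataset.csv.gz',
             s3bucket='my-mirror-bucket',
             s3region='us-east-1')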