import traceback

import boto3
from s3transfer.manager import TransferManager


class S3Bucket:
    def __init__(self, bucket, endpoint, access_key_id, secret_access_key,
                 region):
        self.bucket = bucket
        self.service_endpoint = endpoint
        self.aws_access_key_id = access_key_id
        self.aws_secret_access_key = secret_access_key
        self.region_name = region
        self.client = boto3.client(
            's3',
            endpoint_url=self.service_endpoint,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name)
        # config and osutil default to None, so they need not be passed
        # explicitly.
        self.transfer = TransferManager(self.client)

    def manager_upload(self, file):
        # Strip the leading '/' so the object key is relative. The returned
        # future is discarded, so the upload completes in the background.
        self.transfer.upload(file, self.bucket, file[1:])

    def upload(self, file, filekey):
        # list_objects always returns a (truthy) response dict; the actual
        # object listing is under the 'Contents' key, which is absent when
        # nothing matches the prefix. Checking the bare dict would make
        # this method always return False.
        exist = self.client.list_objects(Bucket=self.bucket,
                                         Prefix=filekey[1:])
        if 'Contents' in exist:
            return False
        try:
            with open(file, 'rb') as f:
                self.client.upload_fileobj(f, self.bucket, filekey[1:])
        except Exception:
            traceback.print_exc()
            raise
        return True
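
# Usage sketch for S3Bucket: the endpoint, credentials, and paths below are
# hypothetical placeholders, not values from the original.
store = S3Bucket(bucket='my-bucket',
                 endpoint='https://s3.us-east-1.amazonaws.com',
                 access_key_id='AKIA...',
                 secret_access_key='...',
                 region='us-east-1')
if store.upload('/tmp/report.csv', '/tmp/report.csv'):
    print('uploaded as tmp/report.csv')
else:
    print('an object with that prefix already exists; skipped')
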
import hashlib
import logging

import botocore
import botocore.exceptions
from s3transfer.manager import TransferManager

# `exceptions` (providing NoSuchBucketError) and `ProgressPercentage` are
# assumed to be defined elsewhere in the surrounding package.

LOG = logging.getLogger(__name__)


class S3Uploader(object):
    """
    Class to upload objects to an S3 bucket that uses versioning. If the
    bucket does not already use versioning, this class will turn it on.
    """

    def __init__(self, s3_client,
                 bucket_name,
                 region,
                 prefix=None,
                 kms_key_id=None,
                 force_upload=False,
                 transfer_manager=None):
        self.bucket_name = bucket_name
        self.prefix = prefix
        self.kms_key_id = kms_key_id or None
        self.force_upload = force_upload
        self.s3 = s3_client
        self.region = region

        self.transfer_manager = transfer_manager
        if not transfer_manager:
            self.transfer_manager = TransferManager(self.s3)

    def upload(self, file_name, remote_path):
        """
        Uploads given file to S3
        :param file_name: Path to the file that will be uploaded
        :param remote_path: Remote path (key) where the file will be uploaded
        :return: VersionId of the latest upload
        """

        if self.prefix and len(self.prefix) > 0:
            remote_path = "{0}/{1}".format(self.prefix, remote_path)

        # Check if a file with the same data already exists
        if not self.force_upload and self.file_exists(remote_path):
            LOG.debug("File with same data already exists at {0}. "
                      "Skipping upload".format(remote_path))
            return self.make_url(remote_path)

        try:
            # Default to regular server-side encryption unless the customer
            # has specified their own KMS key
            additional_args = {
                "ServerSideEncryption": "AES256"
            }

            if self.kms_key_id:
                additional_args["ServerSideEncryption"] = "aws:kms"
                additional_args["SSEKMSKeyId"] = self.kms_key_id

            print_progress_callback = \
                ProgressPercentage(file_name, remote_path)
            future = self.transfer_manager.upload(file_name,
                                                  self.bucket_name,
                                                  remote_path,
                                                  additional_args,
                                                  [print_progress_callback])
            future.result()

            return self.make_url(remote_path)

        except botocore.exceptions.ClientError as ex:
            error_code = ex.response["Error"]["Code"]
            if error_code == "NoSuchBucket":
                raise exceptions.NoSuchBucketError(
                    bucket_name=self.bucket_name)
            raise ex

    def upload_with_dedup(self, file_name, extension=None):
        """
        Makes and returns the name of the S3 object based on the file's
        MD5 sum.

        :param file_name: file to upload
        :param extension: String of file extension to append to the object
        :return: S3 URL of the uploaded object
        """

        # This construction of remote_path is critical to preventing
        # duplicate uploads of the same object. The uploader checks whether
        # the file exists in S3 and re-uploads only if necessary. So if the
        # template points to the same file in multiple places, it will be
        # uploaded only once.

        filemd5 = self.file_checksum(file_name)
        remote_path = filemd5
        if extension:
            remote_path = remote_path + "." + extension

        return self.upload(file_name, remote_path)

    def file_exists(self, remote_path):
        """
        Check if the file we are trying to upload already exists in S3

        :param remote_path: S3 key to check
        :return: True, if file exists. False, otherwise
        """

        try:
            # Find the object that matches this ETag
            self.s3.head_object(Bucket=self.bucket_name, Key=remote_path)
            return True
        except botocore.exceptions.ClientError:
            # Either the file does not exist or we are unable to get this
            # information.
            return False

    def make_url(self, obj_path):
        return "s3://{0}/{1}".format(self.bucket_name, obj_path)

    def file_checksum(self, file_name):

        with open(file_name, "rb") as file_handle:
            md5 = hashlib.md5()
            # Read file in chunks of 4096 bytes
            block_size = 4096

            # Save current cursor position and reset cursor to start of file
            curpos = file_handle.tell()
            file_handle.seek(0)

            buf = file_handle.read(block_size)
            while len(buf) > 0:
                md5.update(buf)
                buf = file_handle.read(block_size)

            # Restore file cursor's position
            file_handle.seek(curpos)

            return md5.hexdigest()

    def to_path_style_s3_url(self, key, version=None):
        """
        This link describes the format of path-style URLs:
        http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingBucket.html#access-bucket-intro
        """
        base = "https://s3.amazonaws.com"
        if self.region and self.region != "us-east-1":
            base = "https://s3-{0}.amazonaws.com".format(self.region)

        result = "{0}/{1}/{2}".format(base, self.bucket_name, key)
        if version:
            result = "{0}?versionId={1}".format(result, version)

        return result
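
# Usage sketch for S3Uploader (bucket name, region, and file path are
# hypothetical). Because the object key is the file's MD5, repeated calls
# with the same content upload only once.
import boto3

uploader = S3Uploader(boto3.client('s3'), 'my-deploy-bucket', 'us-west-2',
                      prefix='artifacts')
url = uploader.upload_with_dedup('template.yaml', extension='template')
print(url)  # s3://my-deploy-bucket/artifacts/<md5>.template
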
import random

# BlockingReaderWriterByteStream is assumed to be a project-local,
# file-like byte queue defined elsewhere in this codebase.


class AtomicRemoteWritableS3File(object):
    """
    An S3 file that writes to a remote temp object on S3; copies to the
    true key on close. This class requires boto3 v1.4.0+ for its
    non-seekable file object upload ability.

    Useful for performing operations on large S3 objects when you don't
    have sufficient space on local drives.

    Works around AWS S3's multipart transfer size requirements and boto3's
    idiosyncratic implementation that requires an initial buffer size
    larger than the multipart transfer threshold in order to correctly
    select the 'read-until-empty' behavior needed for a streaming upload.
    """
    _boto3_default_multipart_threshold = 8 * 1024 * 1024

    def __init__(self, s3_bucket, s3_key, boto3_s3_client=None):
        import boto3
        from s3transfer.manager import TransferManager, TransferConfig
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.s3_client = boto3_s3_client
        if self.s3_client is None:
            self.s3_client = boto3.client('s3')
        self._internal_queue = BlockingReaderWriterByteStream()
        self._boto3_multipart_upload_workaround_buffer = b''
        # randrange requires integer bounds; 1e10 (a float) raises an error
        # on newer Python versions.
        self.temp_s3_key = self.s3_key + '-{:0>10}-tmp'.format(
            random.randrange(0, 10 ** 10))
        # don't start the upload until we've written at least
        # boto3.TransferConfig.multipart_threshold bytes
        self._transfer_manager = TransferManager(self.s3_client,
                                                 TransferConfig())
        self._upload_future = None

    def write(self, some_bytes):
        """
        Writes bytes to S3. This method may not be safely called by
        multiple writers in different threads.
        """
        self._write(some_bytes)

    def _write(self, some_bytes, close_and_flush=False):
        """
        Buffers writes until they're large enough to be safely sent to
        boto3.
        """
        buffer_write = (len(self._boto3_multipart_upload_workaround_buffer)
                        + len(some_bytes)
                        < self._boto3_default_multipart_threshold)
        self._boto3_multipart_upload_workaround_buffer += some_bytes
        if not buffer_write or close_and_flush:
            self._internal_queue.write(
                self._boto3_multipart_upload_workaround_buffer)
            self._boto3_multipart_upload_workaround_buffer = b''
            if not self._upload_future:
                self._submit_upload()

    def _submit_upload(self):
        self._upload_future = self._transfer_manager.upload(
            fileobj=self._internal_queue,
            bucket=self.s3_bucket,
            key=self.temp_s3_key)

    def close(self):
        """
        Closes the writer, so that it will flush to the reader.

        This method will block until the file has been fully flushed to
        S3, and until it has been properly moved to its final destination.
        """
        self._write(b'', close_and_flush=True)
        self._internal_queue.close()
        # wait for upload to complete before moving
        self._upload_future.result()
        self._move_to_final_destination()

    def _move_to_final_destination(self):
        # S3 has no server-side move; copy the temp object to the final
        # key, then delete the temp object.
        self.s3_client.copy_object(Bucket=self.s3_bucket,
                                   Key=self.s3_key,
                                   CopySource={
                                       'Bucket': self.s3_bucket,
                                       'Key': self.temp_s3_key
                                   })
        self.s3_client.delete_object(Bucket=self.s3_bucket,
                                     Key=self.temp_s3_key)

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type:
            self._internal_queue.error('Pipe not properly closed.')
            if self._upload_future:
                self._upload_future.result()
            self.s3_client.delete_object(Bucket=self.s3_bucket,
                                         Key=self.temp_s3_key)
            return
        else:
            self.close()
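
# Usage sketch (bucket, key, and the chunk generator are hypothetical):
# stream bytes to S3 without staging the whole object on local disk. On a
# clean exit the temp object is copied to the final key; on an exception
# the temp object is deleted instead.
with AtomicRemoteWritableS3File('my-bucket', 'exports/large.csv') as out:
    for chunk in produce_csv_chunks():  # hypothetical generator of bytes
        out.write(chunk)
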
from botocore.compat import six
from botocore.exceptions import ClientError
from s3transfer.exceptions import RetriesExceededError as \
    S3TransferRetriesExceededError
from s3transfer.manager import TransferConfig, TransferManager
from s3transfer.utils import OSUtils

from boto3.exceptions import RetriesExceededError, S3UploadFailedError

# ProgressCallbackInvoker is the small Subscriber wrapper defined alongside
# this class in boto3.s3.transfer; it is assumed to be in scope here.


class S3Transfer(object):
    ALLOWED_DOWNLOAD_ARGS = TransferManager.ALLOWED_DOWNLOAD_ARGS
    ALLOWED_UPLOAD_ARGS = TransferManager.ALLOWED_UPLOAD_ARGS

    def __init__(self, client=None, config=None, osutil=None, manager=None):
        if not client and not manager:
            raise ValueError(
                'Either a boto3.Client or s3transfer.manager.TransferManager '
                'must be provided'
            )
        if manager and any([client, config, osutil]):
            raise ValueError(
                'Manager cannot be provided with client, config, '
                'nor osutil. These parameters are mutually exclusive.'
            )
        if config is None:
            config = TransferConfig()
        if osutil is None:
            osutil = OSUtils()
        if manager:
            self._manager = manager
        else:
            self._manager = TransferManager(client, config, osutil)

    def upload_file(self, filename, bucket, key,
                    callback=None, extra_args=None):
        """Upload a file to an S3 object.

        Variants have also been injected into S3 client, Bucket and Object.
        You don't have to use S3Transfer.upload_file() directly.
        """
        if not isinstance(filename, six.string_types):
            raise ValueError('Filename must be a string')

        subscribers = self._get_subscribers(callback)
        future = self._manager.upload(
            filename, bucket, key, extra_args, subscribers)
        try:
            future.result()
        # If a client error was raised, add the backwards compatibility layer
        # that raises a S3UploadFailedError. These specific errors were only
        # ever thrown for upload_parts but now can be thrown for any related
        # client error.
        except ClientError as e:
            raise S3UploadFailedError(
                "Failed to upload %s to %s: %s" % (
                    filename, '/'.join([bucket, key]), e))

    def download_file(self, bucket, key, filename, extra_args=None,
                      callback=None):
        """Download an S3 object to a file.

        Variants have also been injected into S3 client, Bucket and Object.
        You don't have to use S3Transfer.download_file() directly.
        """
        if not isinstance(filename, six.string_types):
            raise ValueError('Filename must be a string')

        subscribers = self._get_subscribers(callback)
        future = self._manager.download(
            bucket, key, filename, extra_args, subscribers)
        try:
            future.result()
        # This is for backwards compatibility where when retries are
        # exceeded we need to throw the same error from boto3 instead of
        # s3transfer's built in RetriesExceededError as current users are
        # catching the boto3 one instead of the s3transfer exception to do
        # their own retries.
        except S3TransferRetriesExceededError as e:
            raise RetriesExceededError(e.last_exception)

    def _get_subscribers(self, callback):
        if not callback:
            return None
        return [ProgressCallbackInvoker(callback)]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self._manager.__exit__(*args)
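
# Usage sketch (bucket and paths are hypothetical): S3Transfer wraps
# TransferManager with the legacy upload_file/download_file interface. The
# callback is invoked with the number of bytes transferred in each chunk.
import boto3

def print_bytes(bytes_transferred):
    print('%d bytes transferred' % bytes_transferred)

transfer = S3Transfer(boto3.client('s3'))
transfer.upload_file('/tmp/backup.tar.gz', 'my-bucket',
                     'backups/backup.tar.gz', callback=print_bytes)
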
def upload_file(**kwargs):
    """Upload manager."""
    filename = kwargs.pop('file')
    parent_id = kwargs.pop('parent_id', None)
    # st_size is the size of the file in bytes (index 6 of the stat tuple)
    multipart_chunksize = file_size = os.stat(filename).st_size
    client = kwargs['client']

    if file_size > MULTIPART_THRESHOLD:  # bigger than 1 GB
        # Use roughly a tenth of the file as the chunk size, clamped to
        # [MIN_CHUNK_SIZE, MAX_CHUNK_SIZE].
        multipart_chunksize = min(int(file_size / 10), int(MAX_CHUNK_SIZE))
        multipart_chunksize = max(multipart_chunksize, int(MIN_CHUNK_SIZE))
        LOGGER.info('File size: %s MB', file_size / MB)
        LOGGER.info('Chunk size: %s MB', int(multipart_chunksize / MB))

    config = TransferConfig(multipart_threshold=MULTIPART_THRESHOLD,
                            max_concurrency=MAX_CONCURRENCY,
                            multipart_chunksize=multipart_chunksize)
    osutil = OSUtilsWithCallbacks()

    # Check if the given parent folder exists
    if parent_id:
        fl_obj = Files(base_url=kwargs['base_url'], api_key=kwargs['api_key'])
        res = fl_obj.get_list(parent_id=parent_id)
        if not res['status']:
            raise NotFoundException('Parent folder for upload does '
                                    'not exist.')

    transfer_manager = TransferManager(client, config=config, osutil=osutil)

    subscribers = [ProgressSubscriber(filename)]

    _, file_name = os.path.split(filename)
    try:
        init_url = client.base_url + urls.UPLOAD_INIT_URL
        response = requests_retry_session().put(
            init_url, json=dict(file_name=file_name), headers=client.header)
        if response.status_code == 402:
            raise NotEnoughCredits('Insufficient credits for upload.')
        if response.status_code == 403:
            raise AuthenticationFailed('Authentication Failed. '
                                       'Wrong API Key.')
        if response.status_code == requests.codes.ok:
            sources = response.json()
            future = transfer_manager.upload(filename,
                                             sources['upload_source'],
                                             sources['upload_key'],
                                             extra_args=None,
                                             subscribers=subscribers)
        else:
            LOGGER.error('File upload initialisation failed. '
                         'Response code: %s', response.status_code)
            raise UploadException('File upload initialisation failed. '
                                  'Response code: %s' % response.status_code)

        try:
            future.result()
        except KeyboardInterrupt:
            do_not_retry_event.set()
            return

        return sources['upload_key']

    # If a client error was raised, add the backwards compatibility layer
    # that raises a S3UploadFailedError. These specific errors were only
    # ever thrown for upload_parts but now can be thrown for any related
    # client error.
    except ClientError as error:
        raise S3UploadFailedError(
            "Failed to upload {} to {}: {}".format(
                filename,
                '/'.join([sources['upload_source'], sources['upload_key']]),
                error))
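
# The function above relies on module-level names that are not shown
# (LOGGER, OSUtilsWithCallbacks, ProgressSubscriber, Files, urls, the
# custom exceptions, requests_retry_session, do_not_retry_event). One
# plausible set of the size constants, purely as an assumption consistent
# with the "bigger than 1 GB" comment and S3's documented part-size limits:
MB = 1024 ** 2
GB = 1024 ** 3
MULTIPART_THRESHOLD = 1 * GB   # assumed threshold for multipart uploads
MIN_CHUNK_SIZE = 5 * MB        # S3's minimum multipart part size
MAX_CHUNK_SIZE = 5 * GB        # S3's maximum multipart part size
MAX_CONCURRENCY = 10           # assumed worker count
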
import logging
import os
import time
import urllib.request

from botocore.exceptions import ClientError
from s3transfer.manager import TransferManager

# _get_client, _is_up_to_date, _set_upload_arg, _set_metadata and
# ConfigurationException are assumed to be defined elsewhere in this module.


def download_url(url, s3key, s3client=None, s3bucket=None, s3region=None):
    """Downloads the given URL to the given S3 destination object.

    :type url: string
    :param url: source URL to download

    :type s3key: string
    :param s3key: key name for the destination S3 object

    :type s3client: botocore.client.S3
    :param s3client: S3 client to use (optional)

    :type s3bucket: string
    :param s3bucket: destination S3 bucket name (optional)

    :type s3region: string
    :param s3region: destination AWS region, e.g. 'us-east-1' (optional)

    Will download the content at the given URL to the given S3
    destination. Will consult the usual boto environment variables and
    configuration to create a client as needed. If the destination region
    is not provided, the AWS_REGION environment variable will be consulted
    instead.
    """
    if s3client is not None:
        s3c = s3client
    else:
        s3c = _get_client(s3region)

    if s3bucket is None and 'AWS_S3_BUCKET' not in os.environ:
        msg = "No S3 bucket name configured"
        logging.error(msg)
        raise ConfigurationException(msg)
    if s3bucket is None:
        s3bucket = os.environ['AWS_S3_BUCKET']

    try:
        logging.info("Validating existence of bucket %s..." % s3bucket)
        start = time.time()
        s3c.head_bucket(Bucket=s3bucket)
        end = time.time()
        logging.info("Bucket %s exists (%ld ms)"
                     % (s3bucket, int((end - start) * 1000.0)))
    except ClientError:
        # head_bucket failed; assume the bucket is missing and create it.
        logging.info("Creating bucket %s..." % s3bucket)
        start = time.time()
        s3c.create_bucket(Bucket=s3bucket)
        end = time.time()
        logging.info("Created bucket %s (%ld ms)"
                     % (s3bucket, int((end - start) * 1000.0)))

    logging.info("Checking metadata on %s..." % url)
    start = time.time()
    req = urllib.request.Request(url, headers={'Accept-Encoding': 'gzip'})
    u = urllib.request.urlopen(req)
    end = time.time()
    logging.info("Fetched metadata on %s (%ld ms)"
                 % (url, int((end - start) * 1000.0)))

    if _is_up_to_date(u, s3c, s3bucket, s3key):
        u.close()
        logging.info("Skipping download of %s to s3://%s/%s (up-to-date)"
                     % (url, s3bucket, s3key))
        return

    tm = TransferManager(s3c)
    extra_args = {'ACL': 'private'}
    _set_upload_arg(extra_args, u, 'Content-Encoding', 'ContentEncoding')
    _set_upload_arg(extra_args, u, 'Content-Type', 'ContentType')
    _set_metadata(extra_args, u)
    logging.info("Beginning download of %s to s3://%s/%s..."
                 % (url, s3bucket, s3key))
    start = time.time()
    # The HTTP response object is streamed straight into the upload; the
    # transfer manager reads it until empty, so the payload never touches
    # local disk.
    f = tm.upload(u, s3bucket, s3key, extra_args=extra_args)
    f.result()
    end = time.time()
    logging.info("Download of %s to s3://%s/%s complete (%ld ms)"
                 % (url, s3bucket, s3key, int((end - start) * 1000.0)))
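
# Usage sketch (URL, bucket, and key are hypothetical): mirror a remote
# file into S3, skipping the transfer when the stored copy is already
# up to date.
download_url('https://example.com/datasets/latest.json.gz',
             'mirrors/latest.json.gz',
             s3bucket='my-mirror-bucket',
             s3region='us-east-1')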