def test_alias_max_io_queue(self):
    """The max_io_queue alias must track max_io_queue_size both at
    construction time and on later assignment."""
    initial = 10
    updated = 15
    config = TransferConfig(max_io_queue=initial)
    self.assert_value_of_actual_and_alias(
        config, 'max_io_queue_size', 'max_io_queue', initial)
    # Assigning through the alias must also update the actual attribute
    # that the TransferManager reads.
    config.max_io_queue = updated
    self.assert_value_of_actual_and_alias(
        config, 'max_io_queue_size', 'max_io_queue', updated)
# NOTE(review): stray tail of a function whose definition begins in an earlier
# chunk of the file; preserved verbatim, truncated at the chunk boundary.
    return True


def check_head_object_works_for_client(s3_client, params):
    # Probe HeadObject with the given params. Returns False only on a 403;
    # any other ClientError propagates, and success returns True.
    try:
        s3_client.head_object(**params)
    except ClientError as e:
        if e.response["Error"]["Code"] == "403":
            # This can also happen if you have full get_object access, but not list_objects_v2, and the object does not
            # exist. Instead of returning a 404, S3 will return a 403.
            return False
    return True


# Module-wide default boto3 managed-transfer configuration.
s3_transfer_config = TransferConfig()

# When uploading files at least this size, compare the ETags first and skip the upload if they're equal;
# copy the remote file onto itself if the metadata changes.
UPLOAD_ETAG_OPTIMIZATION_THRESHOLD = 1024


def _copy_local_file(ctx, size, src_path, dest_path):
    # Local-to-local copy: ensure the destination directory exists, copy the
    # bytes and permission bits, then report progress/completion via ctx.
    pathlib.Path(dest_path).parent.mkdir(parents=True, exist_ok=True)
    # TODO(dima): More detailed progress.
    shutil.copyfile(src_path, dest_path)
    ctx.progress(size)
    shutil.copymode(src_path, dest_path)
    ctx.done(PhysicalKey.from_path(dest_path))
def s3_upload(bucket, local_filepath, s3_filepath, profile_name='default',
              region_name='us-west-2', multipart_threshold=8388608,
              multipart_chunksize=8388608):
    """
    Uploads a file or collection of files to S3

    Parameters
    ----------
    bucket : str
        name of S3 bucket
    local_filepath : str or list
        path and filename(s) to be uploaded; a str may contain a glob
        pattern ('*'), in which case matching files are uploaded
    s3_filepath : str or list
        path and filename(s) within the bucket for the file to be uploaded;
        when local_filepath is a glob pattern this must be a key prefix
    region_name : str
        name of AWS region (default value 'us-west-2')
    profile_name : str
        profile name for credentials (default 'default' or
        organization-specific)
    multipart_threshold : int
        minimum file size to initiate multipart upload
    multipart_chunksize : int
        chunksize for multipart upload

    Returns
    -------
    None

    Example use
    -----------
    # Uploading a single file to S3:
    s3_upload(
        bucket='my_bucket',
        local_filepath='../data/my_file.csv',
        s3_filepath='tmp/my_file.csv')

    # Uploading a list of files to S3 (will not upload contents of
    # subdirectories):
    s3_upload(
        bucket='my_bucket',
        local_filepath=['../data/my_file1.csv', '../data/my_file2.csv',
                        '../img.png'],
        s3_filepath=['tmp/my_file1.csv', 'tmp/my_file2.csv', 'img.png'])

    # Uploading all files in a directory to S3 (will not upload contents
    # of subdirectories):
    s3_upload(
        bucket='my_bucket',
        local_filepath='../data/*',
        s3_filepath='tmp/')
    """
    _download_upload_filepath_validator(s3_filepath=s3_filepath,
                                        local_filepath=local_filepath)
    my_bucket = s3_get_bucket(bucket=bucket,
                              profile_name=profile_name,
                              region_name=region_name)
    # multipart_threshold and multipart_chunksize, defaults = Amazon defaults
    config = TransferConfig(multipart_threshold=multipart_threshold,
                            multipart_chunksize=multipart_chunksize)
    if isinstance(local_filepath, str):
        if '*' in local_filepath:
            items = glob.glob(local_filepath)
            # filter out directories
            local_filepath = [item for item in items if os.path.isfile(item)]
            # os.path.basename is portable, unlike splitting on '/' which
            # breaks on Windows path separators
            s3_filepath = [
                s3_filepath + os.path.basename(f) for f in local_filepath
            ]
        else:
            local_filepath = [local_filepath]
            s3_filepath = [s3_filepath]
    # upload all files to S3
    for local_file, s3_key in zip(local_filepath, s3_filepath):
        try:
            my_bucket.upload_file(local_file, s3_key, Config=config)
        except boto3.exceptions.S3UploadFailedError as e:
            raise S3UploadFailedError(str(e))
    return
def remote_uri(uri, filename, action):
    """
    Act on a file stored behind a remote uri (currently only s3 is handled).

    :param uri: uri of the container of the file
    :param filename: filename to act on
    :param action: must be one of [`up`, `down`, `list`, `del`]
    :return: `uri` unchanged when it is not a remote uri; for `list`, the
        matching keys; otherwise None
    :raises AttributeError: when action is not one of the allowed values
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+, error in future versions).
    if not re.match(r'\w+://\w+.*', uri):
        return uri
    tmp = uri.split('://')
    system = tmp[0]
    location = '://'.join(tmp[1:])
    if action not in ['down', 'up', 'list', 'del']:
        raise AttributeError(
            'remote_uri action attribute must be one of [`up`, `down`, `list`, `del`]'
        )
    if system == 's3':
        import boto3
        from boto3.s3.transfer import TransferConfig

        s3bucket = location.split('/')[0]
        s3connection = boto3.resource('s3')
        s3filename = '/'.join(location.split('/')[1:])
        if action == 'list':
            printd('Listing %s' % (uri), topic='s3')
            files = list(
                map(lambda x: x.key,
                    s3connection.Bucket(s3bucket).objects.all()))
            s3filename = s3filename.strip('/')
            if s3filename:
                files = filter(lambda x: x.startswith(s3filename), files)
            return files
        if action == 'del':
            if filename is None:
                filename = s3filename.split('/')[-1]
            printd('Deleting %s' % uri, topic='s3')
            s3connection.Object(s3bucket, s3filename).delete()
        elif action == 'down':
            if filename is None:
                filename = s3filename.split('/')[-1]
            printd('Downloading %s to %s' % (uri, filename), topic='s3')
            obj = s3connection.Object(s3bucket, s3filename)
            # exist_ok avoids the race between the old exists() check and
            # makedirs() when two downloads share a destination directory
            os.makedirs(os.path.abspath(os.path.split(filename)[0]),
                        exist_ok=True)
            obj.download_file(filename,
                              Config=TransferConfig(use_threads=False))
        elif action == 'up':
            printd('Uploading %s to %s' % (filename, uri), topic='s3')
            from botocore.exceptions import ClientError
            if s3filename.endswith('/'):
                s3filename += filename.split('/')[-1]
            try:
                s3connection.meta.client.head_bucket(Bucket=s3bucket)
            except ClientError as _excp:
                # If a client error is thrown, then check that it was a 404 error.
                # If it was a 404 error, then the bucket does not exist.
                error_code = int(_excp.response['Error']['Code'])
                if error_code == 404:
                    s3connection.create_bucket(Bucket=s3bucket)
                else:
                    raise
            bucket = s3connection.Bucket(s3bucket)
            with open(filename, 'rb') as data:
                bucket.put_object(Key=s3filename, Body=data)  # , Metadata=meta)
#!/usr/bin/env python
"""Copy one object to another key in the web-language-models bucket,
using a user-selectable multipart chunk size."""
import argparse
import boto3
from boto3.s3.transfer import TransferConfig

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-chunksize', type=int, default=1000,
                        help='size of each part in MB')
arg_parser.add_argument('-sourcekey', help='source object location')
arg_parser.add_argument('-targetkey', help='location to copy to')
opts = arg_parser.parse_args()

# Convert chunksize from MB to bytes
part_bytes = opts.chunksize * 1000000
transfer_cfg = TransferConfig(multipart_threshold=part_bytes,
                              multipart_chunksize=part_bytes)

# Server-side copy within the same bucket.
s3_resource = boto3.resource('s3')
source = {'Bucket': 'web-language-models', 'Key': opts.sourcekey}
s3_resource.meta.client.copy(source, 'web-language-models', opts.targetkey,
                             Config=transfer_cfg)
def download_fileobj(self, Bucket, Key, Fileobj, ExtraArgs=None,
                     Callback=None, Config=None):
    """Download an object from S3 to a file-like object.

    The file-like object must be in binary mode.

    This is a managed transfer which will perform a multipart download in
    multiple threads if necessary.

    Usage::

        import boto3
        s3 = boto3.client('s3')

        with open('filename', 'wb') as data:
            s3.download_fileobj('mybucket', 'mykey', data)

    :type Fileobj: a file-like object
    :param Fileobj: A file-like object to download into. At a minimum, it must
        implement the `write` method and must accept bytes.

    :type Bucket: str
    :param Bucket: The name of the bucket to download from.

    :type Key: str
    :param Key: The name of the key to download from.

    :type ExtraArgs: dict
    :param ExtraArgs: Extra arguments that may be passed to the
        client operation.

    :type Callback: function
    :param Callback: A method which takes a number of bytes transferred to
        be periodically called during the download.

    :type Config: boto3.s3.transfer.TransferConfig
    :param Config: The transfer configuration to be used when performing the
        download.
    """
    if not hasattr(Fileobj, 'write'):
        raise ValueError('Fileobj must implement write')
    # Wrap a plain progress callback in the subscriber interface that the
    # transfer manager expects; None means no progress reporting.
    subscribers = (
        [ProgressCallbackInvoker(Callback)] if Callback is not None else None
    )
    transfer_config = TransferConfig() if Config is None else Config
    with create_transfer_manager(self, transfer_config) as manager:
        future = manager.download(
            bucket=Bucket, key=Key, fileobj=Fileobj,
            extra_args=ExtraArgs, subscribers=subscribers)
        return future.result()
from ..debug import debug

try:
    # python2
    from urlparse import urlparse
except:
    # python3
    from urllib.parse import urlparse

from .s3util import get_s3_client, read_in_chunks, get_timestamp

try:
    import boto3
    from boto3.s3.transfer import TransferConfig

    # Files at least twice the default multipart threshold are treated as
    # "large" downloads.
    DOWNLOAD_FILE_THRESHOLD = 2 * TransferConfig().multipart_threshold
    # Largest single ranged GET: just under 2 GiB.
    DOWNLOAD_MAX_CHUNK = 2 * 1024 * 1024 * 1024 - 1
    boto_found = True
except:
    # boto3 is optional; callers check boto_found before using it.
    boto_found = False


def ensure_unicode(x):
    # Pass None through unchanged; otherwise coerce to unicode text.
    return None if x is None else to_unicode(x)


S3GetObject = namedtuple_with_defaults("S3GetObject", "key offset length")

# NOTE(review): this call is truncated at the chunk boundary; its remaining
# arguments continue beyond the visible source.
S3PutObject = namedtuple_with_defaults(
    "S3PutObject",
    "key value path content_type metadata",
def __init__(self, client, resource, config=None):
    """Keep the boto3 client/resource pair and a transfer configuration.

    A falsy config (e.g. None) is replaced by a conservative
    single-threaded TransferConfig.
    """
    self.client = client
    self.resource = resource
    # Default: one worker, threads disabled.
    self.config = config if config else TransferConfig(
        max_concurrency=1, use_threads=False)
import os
from urllib import parse
import boto3  # BUG FIX: boto3.resource() is called below but boto3 was never imported
from botocore.client import Config
from botocore.exceptions import ClientError as S3ClientError
from boto3.s3.transfer import TransferConfig
import logging

# Define Environmental Variables (all required; a missing variable raises
# KeyError at import time, which fails the Lambda fast).
target_bucket = os.environ['destination_bucket']
my_max_pool_connections = int(os.environ['max_pool_connections'])
my_max_concurrency = int(os.environ['max_concurrency'])
my_multipart_chunksize = int(os.environ['multipart_chunksize'])
my_max_attempts = int(os.environ['max_attempts'])

# Set and Declare Configuration Parameters
transfer_config = TransferConfig(max_concurrency=my_max_concurrency,
                                 multipart_chunksize=my_multipart_chunksize)
config = Config(max_pool_connections=my_max_pool_connections,
                retries={'max_attempts': my_max_attempts})

# Instantiate S3Client
s3Client = boto3.resource('s3', config=config)

# # Set up logging
# logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(asctime)s: %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

# Enable Verbose logging for Troubleshooting
# boto3.set_stream_logger("")
def run(self):
    """Worker-thread loop: pull upload jobs from upload_queue and push them
    to NDA's S3 submission buckets, handling resumed multipart uploads and
    expired-credential retries.

    NOTE(review): this body was reconstructed from whitespace-collapsed
    source; the dedent points around the "STOP" sentinel handling are
    ambiguous — verify against the original repository.
    """
    while True and not self.shutdown_flag.is_set():
        # Retry the current upload up to 5 times before pulling a new job.
        if self.upload and self.upload_tries < 5:
            self.upload_tries += 1
        else:
            upload = self.upload_queue.get()
            self.upload = upload[0]
            self.expired = upload[1]
        if self.upload == "STOP":
            # Re-enqueue the sentinel so sibling workers also stop.
            if self.upload_queue.qsize() == 0:
                self.upload_queue.put(["STOP", False])
            self.shutdown_flag.set()
            break
        else:
            # Finish bookkeeping for the previous item and fetch the next.
            self.progress_queue.put(None)
            self.upload_tries = 0
            self.upload = None
            self.upload_queue.task_done()
            upload = self.upload_queue.get()
            self.upload = upload[0]
            self.expired = upload[1]
        file_id, credentials, bucket, key, full_path, file_size = \
            self.upload_config()
        prefix = 'submission_{}'.format(self.submission_id)

        """
        Methods for file transfer:

        If the source file is local, use the credentials supplied by the
        submission API to upload from local file to remote S3 location.

        If the source file is from S3, use a specific AWS Profile to
        retrieve the source file, and uses credentials supplied by the
        submission API to upload to remote S3 location.

        If the file was uploaded using multi-part, it will first complete
        the multi part uploads.
        """
        expired_error = False
        mpu_exist = False
        # Look for an in-flight multipart upload for this key to resume.
        for upload in self.all_mpus:
            if upload['Key'] == key:
                mpu_exist = True
                mpu_to_complete = upload
                break

        if full_path.startswith('s3'):
            """
            Assumes you are uploading from external s3 bucket. SOURCE_BUCKET
            and SOURCE_PREFIX are hard-coded values, which specify where the
            object should be copied from (i.e., 100206 subject directory can
            be located in s3://hcp-openaccess-temp, with a prefix of
            HCP_1200).

            Creates source and destination clients for S3 tranfer. Use
            permanent credentials for accessing both buckets and accounts.
            This will require permission from NDA to write to NDA buckets.

            The transfer uses a file streaming method by streaming the body
            of the file into memory and uploads the stream in chunks using
            AWS S3Transfer. This Transfer client will automatically use
            multi part uploads when necessary.

            To maximize efficiency, only files greater than 8e6 bytes are
            uploaded using the multi part upload. Smaller files are uploaded
            in one part.

            After each successful transfer, the script will change the
            status of the file to complete in NDA's submission webservice.

            NOTE: For best results and to be cost effective, it is best to
            perform this file transfer in an AWS EC2 instance.
            """
            tqdm.monitor_interval = 0
            source_session = boto3.Session(
                aws_access_key_id=self.aws_access_key,
                aws_secret_access_key=self.aws_secret_key)
            config = Config(connect_timeout=240, read_timeout=240)
            self.source_s3 = source_session.resource('s3', config=config)
            source_key = key.split('/')[1:]
            source_key = '/'.join(source_key)
            self.source_key = '/'.join([self.source_prefix, source_key])
            self.fileobj = self.source_s3.Object(
                self.source_bucket, self.source_key).get()['Body']  # file stream
            # self.bytes = self.source_s3.Object(self.source_bucket, self.source_key).get()['ContentLength']
            if mpu_exist:
                # Resume the existing multipart upload, re-verifying parts
                # that already completed and uploading the missing ones.
                u = UploadMultiParts(mpu_to_complete, self.full_file_path,
                                     bucket, prefix, self.config, credentials)
                u.get_parts_information()
                if not self.expired:
                    self.progress_queue.put(u.completed_bytes)
                seq = 1
                for buffer in self.fileobj.iter_chunks(
                        chunk_size=u.chunk_size):
                    if seq in u.parts_completed:
                        part = u.parts[seq - 1]
                        u.check_md5(part, buffer)
                    else:
                        try:
                            u.upload_part(buffer, seq)
                            self.progress_queue.put(len(buffer))
                            # upload missing part
                        except Exception as error:
                            e = str(error)
                            if "ExpiredToken" in e:
                                self.add_back_to_queue(bucket, prefix)
                                expired_error = True
                            else:
                                raise error
                    seq += 1
                if not expired_error:
                    u.complete()
                self.progress_queue.put(None)
            else:
                # Fresh upload: stream the S3 source object straight into
                # the destination bucket with the API-supplied credentials.
                dest_session = boto3.Session(
                    aws_access_key_id=credentials['access_key'],
                    aws_secret_access_key=credentials['secret_key'],
                    aws_session_token=credentials['session_token'],
                    region_name='us-east-1')
                # GB = 1024 ** 3
                config = TransferConfig(multipart_threshold=8 * 1024 * 1024)
                self.dest = dest_session.client('s3')
                self.dest_bucket = bucket
                self.dest_key = key
                self.temp_key = self.dest_key + '.temp'
                try:
                    self.dest.upload_fileobj(
                        self.fileobj,
                        self.dest_bucket,
                        self.dest_key,
                        Callback=self.UpdateProgress(self.progress_queue),
                        Config=config  # ,
                        # ExtraArgs={"Metadata": {"ContentLength": self.bytes}}
                    )
                except boto3.exceptions.S3UploadFailedError as error:
                    e = str(error)
                    if "ExpiredToken" in e:
                        self.add_back_to_queue(bucket, prefix)
                    else:
                        raise error
                self.progress_queue.put(None)
        else:
            """
            Assumes the file is being uploaded from local file system
            """
            if mpu_exist:
                # Resume a multipart upload of a local file, seeking to each
                # chunk boundary and filling in the parts not yet uploaded.
                u = UploadMultiParts(mpu_to_complete, self.full_file_path,
                                     bucket, prefix, self.config, credentials)
                u.get_parts_information()
                if not self.expired:
                    self.progress_queue.put(u.completed_bytes)
                seq = 1
                with open(full_path, 'rb') as f:
                    while True:
                        buffer_start = u.chunk_size * (seq - 1)
                        f.seek(buffer_start)
                        buffer = f.read(u.chunk_size)
                        if len(buffer) == 0:  # EOF
                            break
                        if seq in u.parts_completed:
                            part = u.parts[seq - 1]
                            u.check_md5(part, buffer)
                        else:
                            try:
                                u.upload_part(buffer, seq)
                                self.progress_queue.put(len(buffer))
                            except Exception as error:
                                e = str(error)
                                if "ExpiredToken" in e:
                                    self.add_back_to_queue(bucket, prefix)
                                    expired_error = True
                                    break
                                else:
                                    raise error
                        seq += 1
                if not expired_error:
                    u.complete()
                self.progress_queue.put(None)
            else:
                if credentials:
                    # Plain local-file upload with the API-supplied
                    # temporary credentials.
                    session = boto3.session.Session(
                        aws_access_key_id=credentials['access_key'],
                        aws_secret_access_key=credentials['secret_key'],
                        aws_session_token=credentials['session_token'],
                        region_name='us-east-1')
                    s3 = session.client('s3')
                    config = TransferConfig(
                        multipart_threshold=8 * 1024 * 1024,
                        max_concurrency=2,
                        num_download_attempts=10)
                    s3_transfer = S3Transfer(s3, config)
                    tqdm.monitor_interval = 0
                    try:
                        s3_transfer.upload_file(
                            full_path, bucket, key,
                            callback=self.UpdateProgress(self.progress_queue))
                    except boto3.exceptions.S3UploadFailedError as error:
                        e = str(error)
                        if "ExpiredToken" in e:
                            self.add_back_to_queue(bucket, prefix)
                        else:
                            raise error
                    self.progress_queue.put(None)
                else:
                    print('There was an error uploading {} after {} retry attempts'
                          .format(full_path, self.upload_tries))
                    continue
        # Job finished: reset retry state and acknowledge the queue item.
        self.upload_tries = 0
        self.upload = None
        self.upload_queue.task_done()
import argparse
from botocore.exceptions import ClientError
from boto3.s3.transfer import TransferConfig
import zipfile
import shutil
import threading
from queue import Queue
import time

# NOTE(review): `boto3` itself is used below but not imported in this chunk —
# presumably imported earlier in the file; verify.
s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')
data_dir = '/tmp/'

# 1 MiB threshold and chunk size, up to 10 threads per managed transfer.
config = TransferConfig(multipart_threshold=1048576,
                        multipart_chunksize=1048576,
                        max_concurrency=10,
                        num_download_attempts=10,
                        use_threads=True)

# Serializes print() output across worker threads.
print_lock = threading.Lock()


def parse_args():
    # Build the command-line interface. NOTE(review): the final add_argument
    # call is truncated at the chunk boundary and continues beyond the
    # visible source.
    parser = argparse.ArgumentParser(description="S3 compressor")
    parser.add_argument('--arch_bucket',
                        help='Destination Archive bucket',
                        required=True)
    parser.add_argument('--bucket', help='Source bucket', required=True)
    parser.add_argument('--prefix', default='sources', help='prefix')
    parser.add_argument('--years',
                        help='years. ex: --years 2016,2017',
import json #import shutil import glob import logging import platform import threading from inspect import signature #import traceback import subprocess import pandas as pd #import numpy as np # imported only for np.nan import boto3 from botocore.exceptions import ClientError from boto3.s3.transfer import TransferConfig s3_config = TransferConfig(max_concurrency=20, use_threads=True) logging.basicConfig( level=logging.INFO, format='%(asctime)-12s %(levelname)-8s %(message)s') #from models.BIF import BIF from utilities import utils, logs # some basic utilities for use with s3 storage and interaction with lambdas. SimulateAWS = False # if this is true, then we do not start EC2 instance, and we use local storage to messages # to second python process all running locally to debug the communication code. # Run -op update_archives with this set, and the code will do the following: # 1. uploading has been separately tested, so we can use sample zip file already on resources/s3sim folder. # 2. ec2sim is set up to match situation that would exist in linux environment.
"clips-video", # for youtube-dl etc. "exports", "films", # for films referenced "gallery", # for DaVinci "live-audio", "live-video", "music", "stills", "table-reads", ] MULTIPART_THRESHOLD = 1024 * 1024 * 100 # 100mb MULTIPART_CHUNKSIZE = 1024 * 1024 * 100 # 100mb TRANSFER_CONFIG = TransferConfig( multipart_threshold=MULTIPART_THRESHOLD, max_concurrency=10, multipart_chunksize=MULTIPART_CHUNKSIZE, use_threads=True, ) SKIP_NONE = None SKIP_ETAG = "etag" SKIP_SIZE = "size" SKIP_LMOD = "lmod" SKIP_REGX = "regx" def _md5(filepath, blocksize=2**20): m = hashlib.md5() with open(filepath, "rb") as f: while True: buf = f.read(blocksize)
import boto3
from boto3.s3.transfer import TransferConfig

# Only objects of at least 5 GiB trigger a multipart upload.
GB = 1024 ** 3
config = TransferConfig(multipart_threshold=5 * GB)

s3 = boto3.client('s3')
s3.upload_file('/home/aakash/Videos/DontBreatheHD.mp4', 'multipart-aakash',
               'DontBreatheHD.mp4', Config=config)

# Renamed from `list`, which shadowed the builtin of the same name.
buckets = s3.list_buckets()
print(buckets)
def main():
    """CLI entry point: parse arguments and dispatch exactly one of the
    mutually exclusive actions (upload, relay, or metadata operations)."""
    parser = argparse.ArgumentParser(
        prog='cta-data-relay',
        description='',
        formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(
            prog, max_help_position=27, width=90))
    actions_grp = parser.add_argument_group(
        title='Actions', description='(exactly one must be specified)')
    actions_mxgrp = actions_grp.add_mutually_exclusive_group(required=True)
    actions_mxgrp.add_argument('--local-to-s3', action='store_true',
                               help='Upload local files to S3 storage')
    actions_mxgrp.add_argument('--s3-to-gridftp', action='store_true',
                               help='Move files from S3 to gridftp storage')
    actions_mxgrp.add_argument('--meta-show', action='store_true',
                               help='Show S3 metadata')
    actions_mxgrp.add_argument('--meta-vs-gridftp', action='store_true',
                               help='Compare S3 metadata vs gridftp storage')
    actions_mxgrp.add_argument('--meta-vs-local', action='store_true',
                               help='Compare S3 metadata vs local storage')
    actions_mxgrp.add_argument('--meta-set-gridftp', action='store_true',
                               help='Set S3 metadata to match gridftp storage')
    actions_mxgrp.add_argument(
        '--meta-prune-to-gridftp', action='store_true',
        help='Prune from S3 metadata files not in gridftp')
    misc_grp = parser.add_argument_group('Miscellaneous options')
    misc_grp.add_argument('--local-path', metavar='PATH',
                          help='local source file or directory')
    misc_grp.add_argument('--timeout', metavar='SECONDS', type=int,
                          help='terminate after this amount of time')
    misc_grp.add_argument('--tempdir', metavar='PATH', default='/tmp',
                          help='directory for (de)compression')
    misc_grp.add_argument('--dry-run', default=False, action='store_true',
                          help='dry run')
    s3_grp = parser.add_argument_group('S3 options')
    s3_grp.add_argument('--s3-url', metavar='URL',
                        default='https://rgw.icecube.wisc.edu',
                        help='S3 endpoint URL')
    s3_grp.add_argument('-b', '--bucket', metavar='NAME', required=True,
                        help='S3 bucket name')
    s3_grp.add_argument('-i', dest='access_key_id', help='S3 access key id')
    s3_grp.add_argument('-k', dest='secret_access_key',
                        help='S3 secret access key')
    s3_grp.add_argument('--s3-threads', metavar='NUM', type=int, default=80,
                        help='maximum number of S3 transfer threads')
    s3_grp.add_argument('--object', metavar='KEY',
                        help='operate on specific S3 object only')
    s3_grp.add_argument('--s3-stats-freq', metavar='SEC', default=20,
                        type=int,
                        help='frequency of S3 upload progress updates')
    grid_grp = parser.add_argument_group('GridFTP options')
    grid_grp.add_argument('--gridftp-url', metavar='URL',
                          default='gsiftp://gridftp.icecube.wisc.edu',
                          help='GridFTP endpoint URL')
    grid_grp.add_argument('--gridftp-path', metavar='PATH',
                          help='GridFTP path')
    grid_grp.add_argument('--gridftp-threads', metavar='NUM', type=int,
                          default=45, help='gridftp worker pool size')

    args = parser.parse_args()

    if args.timeout:
        # Hard-terminate via SIGALRM once the allowed wall-clock time is up.
        signal.alarm(args.timeout)
    if not os.path.isdir(args.tempdir):
        parser.exit(f'Invalid argument: {args.tempdir} is not a directory')

    s3 = boto3.resource('s3', 'us-east-1', endpoint_url=args.s3_url,
                        aws_access_key_id=args.access_key_id,
                        aws_secret_access_key=args.secret_access_key)
    bucket = s3.Bucket(args.bucket)
    bucket.create()

    # Half the cores for compression workers; 1 MiB multipart chunks.
    compr_threads = max(1, int(os.cpu_count() / 2))
    multipart_size = 2**20

    if args.local_to_s3:
        import cta_data_relay.s3zstd
        tx_config = TransferConfig(max_concurrency=args.s3_threads,
                                   multipart_threshold=multipart_size,
                                   multipart_chunksize=multipart_size)
        if args.local_path is None:
            parser.exit(f'Missing required argument --local-path')
        if os.path.isfile(args.local_path):
            file_info = [(args.local_path, os.path.getsize(args.local_path))]
        else:
            file_info = [(de.path, de.stat().st_size)
                         for de in os.scandir(args.local_path)
                         if de.is_file()]
        # Sort to send small files first. This avoids the situation where
        # a file that is too big to be transferred within the allowed time
        # permanently blocks files that follow it in the list
        file_info.sort(key=itemgetter(1))
        cta_data_relay.s3zstd.zupload(bucket, file_info, args.tempdir,
                                      compr_threads, tx_config,
                                      args.s3_stats_freq, args.dry_run)
    elif args.s3_to_gridftp:
        if args.gridftp_path is None:
            parser.exit(f'Missing required argument --gridftp-path')
        s3_to_gridftp(bucket, args.gridftp_url, args.gridftp_path,
                      args.tempdir, args.object, args.dry_run)
    elif args.meta_set_gridftp:
        import cta_data_relay.meta
        if args.gridftp_path is None:
            parser.exit(f'Missing required argument --gridftp-path')
        cta_data_relay.meta.set_gridftp(bucket, args.gridftp_url,
                                        args.gridftp_path,
                                        args.gridftp_threads, args.dry_run)
    elif args.meta_show:
        import cta_data_relay.meta
        cta_data_relay.meta.show(bucket, args.object)
    elif args.meta_vs_gridftp:
        import cta_data_relay.meta
        if args.gridftp_path is None:
            parser.exit(f'Missing required argument --gridftp-path')
        cta_data_relay.meta.diff_gridftp(bucket, args.gridftp_url,
                                         args.gridftp_path,
                                         args.gridftp_threads, args.dry_run)
    elif args.meta_vs_local:
        import cta_data_relay.meta
        if args.local_path is None:
            parser.exit(f'Missing required argument --local-path')
        cta_data_relay.meta.diff_local(bucket, args.local_path)
    elif args.meta_prune_to_gridftp:
        import cta_data_relay.meta
        if args.gridftp_path is None:
            parser.exit(f'Missing required argument --gridftp-path')
        cta_data_relay.meta.prune_not_in_gridftp(bucket, args.gridftp_url,
                                                 args.gridftp_path,
                                                 args.dry_run)
    else:
        parser.exit('Usage error. Unexpected command.')
def download_process_data_local(start_date, bands, end_date, aws_key,
                                aws_secret, aws_bucket_name, delta,
                                zip_grib=False, chunksize=5, retries=5,
                                max_workers=10):
    """
    Download a date range of NOAA NARR .grb files, convert them to GeoTIFF
    with GDAL, and upload the zipped result to S3.

    All dates in the NOAA NARR FTP server are stored following this order:
        data
        ├── year/month
        ├── year/month/day01
        ├── year/month/day02

    Params:
        - start_date (str or datetime): first date ('%Y-%m-%d' when str).
        - bands: bands forwarded to the GDAL transformation.
        - end_date (str/datetime/None): last date; None defaults to
          start_date plus one month.
        - aws_key / aws_secret: unused here (the boto3 session reads the
          'default' profile) — kept for interface compatibility.
        - aws_bucket_name (str): destination bucket for the zipped GeoTIFFs.
        - delta: optional extra range forwarded to datetime_range.
        - zip_grib (bool): if True, keep the raw .grb downloads zipped.
        - chunksize / retries / max_workers: multiprocessing download knobs.

    Raises:
        ValueError: when start_date is neither a datetime nor a str.
    """
    logger = logging.getLogger('luigi-interface')
    GB = 1024**3
    session = boto3.Session(profile_name='default')
    s3 = session.client('s3')
    config = TransferConfig(multipart_threshold=5 * GB)
    base_url = 'https://nomads.ncdc.noaa.gov/data/narr'
    times = ['0000', '0300', '0600', '0900', '1200', '1500', '1800', '2100']

    # BUG FIX: coerce start_date *before* using it in date arithmetic (the
    # original defaulted end_date first, which fails for a str start_date),
    # and actually raise the ValueError (it was previously constructed but
    # never raised).
    if isinstance(start_date, str):
        start_date = datetime.strptime(start_date, '%Y-%m-%d')
    elif not isinstance(start_date, datetime):
        raise ValueError(
            f'{start_date} is not in the correct format or not a valid type')

    if end_date is None:
        end_date = start_date + relativedelta(**{'months': 1})

    if delta is None:
        dates = datetime_range(start_date, end_date, {'days': 1})
    else:
        dates = datetime_range(start_date, end_date + delta)

    # One URL per (day, time-of-day) combination.
    urls_time_range = []
    for day, hour in product(dates, times):
        file_name = f'narr-a_221_{day.strftime("%Y%m%d")}_{hour}_000.grb'
        url = URL(base_url, day.strftime('%Y%m'), day.strftime('%Y%m%d'))
        urls_time_range.append(str(URL(url, file_name)))

    with multiprocessing.Pool(max_workers) as p:
        results = p.map(partial(requests_to_s3, retries=retries),
                        urls_time_range, chunksize=chunksize)

    if zip_grib:
        logger.info(
            f'Finish download for start_date {start_date.strftime("%Y-%m")}')
        temp_dir_grb = mkdtemp()
        temp_file_grb = NamedTemporaryFile()
        path_to_temp_file = os.path.join(temp_dir_grb,
                                         f'{temp_file_grb.name}.zip')
        with zipfile.ZipFile(path_to_temp_file,
                             mode='w',
                             compression=zipfile.ZIP_DEFLATED,
                             compresslevel=1) as zf:
            for content_file_name, content_file_result in results:
                try:
                    zf.writestr(content_file_name, content_file_result)
                except Exception as exc:
                    logger.info(exc)
    else:
        path_to_temp_file = mkdtemp()
        for content_file_name, content_file_result in results:
            with open(os.path.join(path_to_temp_file, content_file_name),
                      'wb') as grb_file:
                grb_file.write(content_file_result)

    temp_dir_geo = mkdtemp()
    logger.info(
        f'Transforming GRIB to GeoTIFF using GDAL [{start_date.strftime("%Y-%m")}]'
    )
    gdal_transform_tempfile(temp_file_path=path_to_temp_file,
                            out_dir=temp_dir_geo,
                            bands=bands,
                            zip_grib=zip_grib)

    try:
        logger.info(
            f'Zipping GEOTiffs files and packing to upload [{start_date.strftime("%Y-%m")}]'
        )
        temp_file_geo = NamedTemporaryFile()
        path_geotiffs = Path(temp_dir_geo).rglob('*.tif')
        with zipfile.ZipFile(f'{temp_file_geo.name}.zip',
                             mode='w',
                             compression=zipfile.ZIP_DEFLATED,
                             compresslevel=1) as zip_geo:
            for geo_file in path_geotiffs:
                zip_geo.write(geo_file, geo_file.name)
        logger.info(
            f'Finish zipping - Starting upload to S3 [{start_date.strftime("%Y-%m")}]'
        )
        key = f"processed_geotiff_wind/narr_data_{start_date.strftime('%Y_%m')}.zip"
        s3.upload_file(f'{temp_file_geo.name}.zip', aws_bucket_name, key,
                       Config=config)
    except Exception as exc:
        logger.info(exc)

    # NOTE(review): when zip_grib is True, path_to_temp_file is a .zip file,
    # not a directory, so rmtree would fail on it — verify intended cleanup.
    shutil.rmtree(temp_dir_geo)
    shutil.rmtree(path_to_temp_file)
    os.remove(f'{temp_file_geo.name}.zip')
def upload_site(directory, config):
    """Upload a static site directory to an S3 website bucket, skipping
    files that appear unchanged since the last deploy.

    :param directory: str or Path of the built site to upload
    :param config: dict with at least 'debug' and 'refresh' keys; 'name'
        (the bucket name) is derived from the git branch when absent
    :return: dict with 'uploaded', 'skipped' and 'took' entries
    """
    if isinstance(directory, str):
        directory = Path(directory)
    if not config.get("name"):
        # Default the deployment name from user + git branch + date.
        try:
            repo = _find_git_repo(directory)
        except NoGitDirectory:
            raise NoGitDirectory(
                f"From {directory} can't find its git root directory "
                "which is needed to supply a default branchname.")
        active_branch = repo.active_branch
        config["name"] = DEFAULT_NAME_PATTERN.format(
            username=getpass.getuser(),
            branchname=active_branch.name,
            date=datetime.datetime.utcnow().strftime("%Y%m%d"),
        )
    info(f"About to upload {ppath(directory)} to {config['name']}")
    session = boto3.Session(profile_name=AWS_PROFILE)
    s3 = session.client("s3")
    # First make sure the bucket exists
    try:
        s3.head_bucket(Bucket=config["name"])
    except ClientError as error:
        # If a client error is thrown, then check that it was a 404 error.
        # If it was a 404 error, then the bucket does not exist.
        if error.response["Error"]["Code"] != "404":
            raise
        # Needs to be created.
        bucket_config = {}
        if S3_DEFAULT_BUCKET_LOCATION:
            # NOTE(review): hard-codes "us-west-1" rather than using the
            # S3_DEFAULT_BUCKET_LOCATION value itself — verify intent.
            bucket_config["LocationConstraint"] = "us-west-1"
        s3.create_bucket(
            Bucket=config["name"],
            ACL="public-read",
            CreateBucketConfiguration=bucket_config,
        )
    try:
        website_bucket = s3.get_bucket_website(Bucket=config["name"])
    except ClientError as error:
        if error.response["Error"]["Code"] != "NoSuchWebsiteConfiguration":
            raise
        # Define the website configuration
        website_configuration = {
            "ErrorDocument": {
                "Key": "error.html"
            },
            "IndexDocument": {
                "Suffix": "index.html"
            },
        }
        website_bucket = s3.put_bucket_website(
            Bucket=config["name"],
            WebsiteConfiguration=website_configuration,
            # XXX Would be nice to set expiration here
        )
        info(f"Created website bucket called {config['name']}")
    if config["debug"]:
        info(f"Website bucket: {website_bucket!r}")
    # Inventory of keys already in the bucket (key -> object metadata),
    # unless a full refresh was requested.
    uploaded_already = {}
    if config["refresh"]:
        info("Refresh, so ignoring what was previously uploaded.")
    else:
        continuation_token = None
        while True:
            # Have to do this so that 'ContinuationToken' can be omitted if falsy
            list_kwargs = dict(Bucket=config["name"])
            if continuation_token:
                list_kwargs["ContinuationToken"] = continuation_token
            response = s3.list_objects_v2(**list_kwargs)
            for obj in response.get("Contents", []):
                uploaded_already[obj["Key"]] = obj
            if response["IsTruncated"]:
                continuation_token = response["NextContinuationToken"]
            else:
                break
        warning(f"{len(uploaded_already):,} files already uploaded.")
    transfer_config = TransferConfig()
    skipped = []
    to_upload_maybe = []
    to_upload_definitely = []
    # Classify every file: junk (skip), definitely changed, or maybe changed.
    for fp in directory.glob("**/*.*"):
        key = str(fp.relative_to(directory))
        # name = str(fp)
        size = os.stat(fp).st_size
        with open(fp, "rb") as f:
            file_hash = hashlib.md5(f.read()).hexdigest()
        task = UploadTask(key, str(fp), size, file_hash)
        if is_junk_file(fp):
            skipped.append(task)
            continue
        if key not in uploaded_already or uploaded_already[key]["Size"] != size:
            # No doubt! We definitely didn't have this before or it's definitely
            # different.
            to_upload_definitely.append(task)
        else:
            # At this point, the key exists and the size hasn't changed.
            # However, for some files, that's not conclusive.
            # Image, a 'index.html' file might have this as its diff:
            #
            #    - <script src=/foo.a9bef19a0.js></script>
            #    + <script src=/foo.3e98ca01d.js></script>
            #
            # ...which means it definitely has changed but the file size is
            # exactly the same as before.
            # If this is the case, we're going to *maybe* upload it.
            # However, for files that are already digest hashed, we don't need
            # to bother checking.
            if _has_hashed_filename(key):
                skipped.append(task)
            else:
                to_upload_maybe.append(task)
    T0 = time.time()
    futures = {}
    total_threadpool_time = []
    uploaded = {}
    # Fan the uploads out over a thread pool; "maybe" tasks re-check the
    # content hash before transferring.
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=MAX_WORKERS_PARALLEL_UPLOADS) as executor:
        if to_upload_maybe:
            info("About to consider " f"{len(to_upload_maybe):,} files")
        if to_upload_definitely:
            info("About to upload " f"{len(to_upload_definitely):,} files")
        bucket_name = config["name"]
        for list_, check_hash_first in (
            (to_upload_definitely, False),
            (to_upload_maybe, True),
        ):
            for task in list_:
                futures[executor.submit(
                    _upload_file_maybe,
                    s3,
                    task,
                    bucket_name,
                    transfer_config,
                    check_hash_first,
                )] = task
        for future in concurrent.futures.as_completed(futures):
            was_uploaded, took = future.result()
            task = futures[future]
            uploaded[task] = (was_uploaded, took)
            total_threadpool_time.append(took)
    T1 = time.time()
    actually_uploaded = [k for k, v in uploaded.items() if v[0]]
    actually_skipped = [k for k, v in uploaded.items() if not v[0]]
    if skipped or actually_skipped:
        warning(
            f"Skipped uploading {len(skipped) + len(actually_skipped):,} files"
        )
    if uploaded:
        if actually_uploaded:
            total_uploaded_size = sum([x.size for x in actually_uploaded])
            # NOTE(review): the "/s" figure divides by a constant 60, i.e. it
            # assumes a one-minute upload — verify intent.
            success(f"Uploaded {len(actually_uploaded):,} "
                    f"{'file' if len(actually_uploaded) == 1 else 'files'} "
                    f"(totalling {fmt_size(total_uploaded_size)}) "
                    f"(~{fmt_size(total_uploaded_size / 60)}/s)")
        if total_threadpool_time:
            info("Sum of time to upload in thread pool "
                 f"{fmt_seconds(sum(total_threadpool_time))}")
    success(f"Done in {fmt_seconds(T1 - T0)}")
    return {"uploaded": uploaded, "skipped": skipped, "took": T1 - T0}
"""Demonstrate a single-threaded boto3 managed transfer.

Threads are used by default in the managed transfer methods. Setting
use_threads to False keeps the whole transfer on the main thread; in that
mode the value of max_concurrency is ignored.
"""
import boto3
from boto3.s3.transfer import TransferConfig

s3_bucket = "test-uala"
s3_file = "tmp.txt.large"

# Get the service client
s3 = boto3.client('s3')

# Ensure that no threads are used.
config = TransferConfig(use_threads=False)

# Download object at s3_bucket with key-name to tmp.txt with the
# set configuration
s3.download_file(s3_bucket, s3_file, s3_file, Config=config)
def execute(self): """Executes the fetch operation. This is different to the DB API as it returns an iterable. Of course we could model that API more precisely in future. :return: An iterable of the records fetched """ # print("Executing select_object_content") self.timer.start() if not self.need_s3select: proj_dir = os.environ['PYTHONPATH'].split(":")[0] table_loc = os.path.join(proj_dir, TABLE_STORAGE_LOC) if not os.path.exists(table_loc): os.makedirs(table_loc) self.table_local_file_path = os.path.join(table_loc, self.s3key) if not os.path.exists( self.table_local_file_path) or not USE_CACHED_TABLES: config = TransferConfig(multipart_chunksize=8 * MB, multipart_threshold=8 * MB) self.table_data = io.BytesIO() self.s3.download_fileobj(Bucket=S3_BUCKET_NAME, Key=self.s3key, Fileobj=self.table_data, Config=config) self.num_http_get_requests = Cursor.calculate_num_http_requests( self.table_data, config) return self.parse_file() else: # Note: # # CSV files use | as a delimiter and have a trailing delimiter so record delimiter is |\n # # NOTE: As responses are chunked the file headers are only returned in the first chunk. # We ignore them for now just because its simpler. It does mean the records are returned as a list # instead of a dict though (can change in future). # response = self.s3.select_object_content( Bucket=S3_BUCKET_NAME, Key=self.s3key, ExpressionType='SQL', Expression=self.s3sql, InputSerialization={ 'CSV': { 'FileHeaderInfo': 'Use', 'RecordDelimiter': '|\n', 'FieldDelimiter': '|' } }, OutputSerialization={'CSV': {}}) self.event_stream = response['Payload'] self.num_http_get_requests = 1 return self.parse_event_stream()
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    The object is uploaded with a public-read ACL, a public link is printed
    and also copied to the clipboard.

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name
    # Credentials come from the environment.
    # NOTE(review): if any variable is unset, the corresponding kwarg is
    # None and boto3 falls back to its default credential chain — confirm
    # that is intended.
    credentials = {'aws_access_key_id': os.getenv('AWS_SERVER_PUBLIC_KEY'),
                   'aws_secret_access_key': os.getenv('AWS_SERVER_SECRET_KEY'),
                   'region_name': os.getenv('REGION_NAME')
                   }
    # Upload the file
    s3_client = boto3.client('s3', **credentials,
                             config=Config(signature_version='s3v4'))
    # Multipart engages above 1024 * 25 bytes (25 KB), 10 worker threads.
    transfer_config = TransferConfig(multipart_threshold=1024 * 25,
                                     max_concurrency=10,
                                     multipart_chunksize=1024 * 25,
                                     use_threads=True)
    try:
        # print('aaa', file_name)
        print(bcolors.WARNING + 'Start upload: ' + bcolors.ENDC + os.path.split(file_name)[1])
        print(bcolors.WARNING + 'Renamed to: ' + bcolors.ENDC + os.path.split(object_name)[1])
        # Publicly readable object; progress is printed by the callback.
        response = s3_client.upload_file(file_name, bucket, object_name,
                                         ExtraArgs={'ACL': 'public-read'},
                                         Config=transfer_config,
                                         Callback=ProgressPercentage(file_name)
                                         )
    except ClientError as e:
        print('Error!!!')
        print(e)
        return False
    # https://stackoverflow.com/questions/33809592/upload-to-amazon-s3-using-boto3-and-return-public-url
    # generate link 1
    # link = s3_client.generate_presigned_url('get_object', ExpiresIn=7776000, Params={'Bucket': S3_Bucket, 'Key': object_name})
    # generate link 2
    # import boto3
    # s3_client = boto3.client
    # bucket_location = s3_client.get_bucket_location(Bucket='my_bucket_name')
    # url = "https://s3.{0}.amazonaws.com/{1}/{2}".format(bucket_location['LocationConstraint'], 'my_bucket_name',
    #     quote_plus('2018-11-26 16:34:48.351890+09:00.jpg')
    # print(url)
    # generate link 3: endpoint/bucket/key — works for public objects only.
    link = '%s/%s/%s' % (s3_client.meta.endpoint_url, bucket, object_name)
    print('\n' + bcolors.WARNING + 'File link: ' + link + bcolors.ENDC)
    # Copy the public link to the clipboard for convenience.
    pyperclip.copy(link)
    spam = pyperclip.paste()
    # (Russian message below: "Link copied to clipboard.")
    print('\n' + bcolors.OKGREEN + 'Ссылка скопирована в буфер обмена.')
    return True
def __init__(self, s3_client: boto3.session.Session.client) -> None:
    """Keep a reference to the client and build the transfer configuration.

    The configuration uses a 4 MiB I/O chunk size and only switches to
    multipart transfers for objects of 4 GiB or larger.
    """
    mib = 2 ** 20
    gib = 2 ** 30
    self.s3_client = s3_client
    self.s3_config = TransferConfig(
        io_chunksize=4 * mib,
        multipart_threshold=4 * gib,
    )
def copy(self, CopySource, Bucket, Key, ExtraArgs=None, Callback=None,
         SourceClient=None, Config=None):
    """Copy an object from one S3 location to another.

    A managed transfer: large objects are copied as a multipart copy in
    multiple threads when necessary.

    Usage::

        import boto3
        s3 = boto3.resource('s3')
        copy_source = {
            'Bucket': 'mybucket',
            'Key': 'mykey'
        }
        s3.meta.client.copy(copy_source, 'otherbucket', 'otherkey')

    :type CopySource: dict
    :param CopySource: The name of the source bucket, key name of the
        source object, and optional version ID of the source object, as
        ``{'Bucket': 'bucket', 'Key': 'key', 'VersionId': 'id'}``.
        ``VersionId`` may be omitted.

    :type Bucket: str
    :param Bucket: The name of the bucket to copy to

    :type Key: str
    :param Key: The name of the key to copy to

    :type ExtraArgs: dict
    :param ExtraArgs: Extra arguments that may be passed to the
        client operation

    :type Callback: method
    :param Callback: A method which takes a number of bytes transferred
        to be periodically called during the copy.

    :type SourceClient: botocore or boto3 Client
    :param SourceClient: The client used for operations on the source
        object (e.g. the head_object that determines the copy size).
        Defaults to the current client.

    :type Config: boto3.s3.transfer.TransferConfig
    :param Config: The transfer configuration to be used when performing
        the copy.
    """
    # Wrap the plain byte-count callback in a subscriber, if given.
    subscribers = [ProgressCallbackInvoker(Callback)] if Callback is not None else None
    # Fall back to the default transfer configuration.
    transfer_config = Config if Config is not None else TransferConfig()
    with create_transfer_manager(self, transfer_config) as manager:
        future = manager.copy(
            copy_source=CopySource,
            bucket=Bucket,
            key=Key,
            extra_args=ExtraArgs,
            subscribers=subscribers,
            source_client=SourceClient,
        )
        # Block until the copy finishes, propagating any transfer error.
        return future.result()
def zip_campaign_files():  # pylint: disable=too-many-locals
    """Archive and publish all test campaign data to the S3 repository.

    It allows collecting all the artifacts from the S3 repository.
    It could be overriden if the common implementation is not suitable.

    The credentials must be configured before publishing the artifacts:

        * fill ~/.aws/credentials or ~/.boto,
        * set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY in env.

    The next vars must be set in env:

        * S3_ENDPOINT_URL (http://127.0.0.1:9000),
        * S3_DST_URL (s3://xtesting/prefix),

    Returns:
        Campaign.EX_OK if artifacts were published to repository.
        Campaign.EX_ZIP_CAMPAIGN_FILES_ERROR otherwise.
    """
    try:
        build_tag = env.get('BUILD_TAG')
        # Dump the DB and the artifacts first; both must succeed before
        # the zip is built.
        assert Campaign.dump_db() == Campaign.EX_OK
        assert Campaign.dump_artifacts() == Campaign.EX_OK
        # Zip the campaign JSON plus everything under the build-tag dir.
        with zipfile.ZipFile(f'{build_tag}.zip', 'w',
                             zipfile.ZIP_DEFLATED) as zfile:
            zfile.write(f"{build_tag}.json")
            for root, _, files in os.walk(build_tag):
                for filename in files:
                    zfile.write(os.path.join(root, filename))
        b3resource = boto3.resource(
            's3', endpoint_url=os.environ["S3_ENDPOINT_URL"])
        dst_s3_url = os.environ["S3_DST_URL"]
        # NOTE(review): 5 * 1024**5 (5 PiB) effectively disables multipart
        # uploads on Google endpoints; confirm that is the intent.
        multipart_threshold = 5 * 1024**5 if "google" in os.environ[
            "S3_ENDPOINT_URL"] else 8 * 1024 * 1024
        tconfig = TransferConfig(multipart_threshold=multipart_threshold)
        bucket_name = urllib.parse.urlparse(dst_s3_url).netloc
        mime_type = mimetypes.guess_type(f'{build_tag}.zip')
        path = urllib.parse.urlparse(dst_s3_url).path.strip("/")
        # pylint: disable=no-member
        b3resource.Bucket(bucket_name).upload_file(
            f'{build_tag}.zip',
            os.path.join(path, f'{build_tag}.zip'),
            Config=tconfig,
            ExtraArgs={
                'ContentType': mime_type[0] or 'application/octet-stream'
            })
        dst_http_url = os.environ["HTTP_DST_URL"]
        link = os.path.join(dst_http_url, f'{build_tag}.zip')
        Campaign.__logger.info(
            "All data were successfully published:\n\n%s", link)
        return Campaign.EX_OK
    except KeyError as ex:
        # A required environment variable is missing.
        Campaign.__logger.error("Please check env var: %s", str(ex))
        return Campaign.EX_ZIP_CAMPAIGN_FILES_ERROR
    except botocore.exceptions.NoCredentialsError:
        Campaign.__logger.error(
            "Please fill ~/.aws/credentials, ~/.boto or set "
            "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY in env")
        return Campaign.EX_ZIP_CAMPAIGN_FILES_ERROR
    except Exception:  # pylint: disable=broad-except
        Campaign.__logger.exception("Cannot publish the artifacts")
        return Campaign.EX_ZIP_CAMPAIGN_FILES_ERROR
from itertools import groupby from operator import attrgetter from . import _util from .exceptions import BucketStorageUnavailableException from .storage import Storage # Max size in bytes before uploading in parts. AWS_UPLOAD_MAX_SIZE = 8 * 1024 * 1024 # Size of parts when uploading in parts AWS_UPLOAD_PART_SIZE = 8 * 1024 * 1024 s3_multipart_config = TransferConfig( multipart_threshold=AWS_UPLOAD_MAX_SIZE, multipart_chunksize=AWS_UPLOAD_PART_SIZE, max_concurrency=10, num_download_attempts=10, ) class Bucket(Storage): """Represents a resource/result bucket. This class is the interface to manage resources or results from a :class:`qarnot.bucket.Bucket`. .. note:: A :class:`Bucket` must be created with :meth:`qarnot.connection.Connection.create_bucket` or retrieved with :meth:`qarnot.connection.Connection.buckets`, :meth:`qarnot.connection.Connection.retrieve_bucket`, or :meth:`qarnot.connection.Connection.retrieve_or_create_bucket`.
import logging import boto3 from boto3.s3.transfer import TransferConfig MB = 1024**2 GB = 1024**3 logging.basicConfig(format=f'%(asctime)s %(levelname)s %(message)s', level=logging.INFO) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger('S3upload') logger.propagate = False ch = logging.StreamHandler() ch.setFormatter(formatter) ch.setLevel(logging.ERROR) logger.addHandler(ch) CONFIG = TransferConfig(multipart_threshold=GB) class ProgressTracker(object): def __init__(self): self._size = 0 self._numfiles = 0 self._seen_so_far = 0 self.completed = 0 self.failed = 0 self._lock = threading.Lock() def trackfile(self, fpath): fpath = pathlib.Path(fpath) with self._lock: self._size += fpath.stat().st_size
def upload_file(
    data_file: str,
    meta: 'SnowflakeFileMeta',
    encryption_metadata: 'EncryptionMetadata',
    max_concurrency: int,
    multipart_threshold: int,
):
    """Uploads the local file to S3.

    On success ``meta.result_status`` is set to UPLOADED; recoverable
    failures set RENEW_TOKEN / NEED_RETRY / NEED_RETRY_WITH_LOWER_CONCURRENCY
    instead of raising, so the caller drives the retry loop.

    Args:
        data_file: File path on local system.
        meta: The File meta object (contains credentials and remote location).
        encryption_metadata: Encryption metadata to be set on object.
        max_concurrency: The maximum number of threads to used to upload.
        multipart_threshold: The number of bytes after which size a file
            should be uploaded concurrently in chunks.

    Raises:
        HTTPError if some http errors occurred.

    Returns:
        None.
    """
    try:
        # Standard object metadata: content type plus the SHA-256 digest
        # Snowflake uses to verify the staged file.
        s3_metadata = {
            HTTP_HEADER_CONTENT_TYPE: HTTP_HEADER_VALUE_OCTET_STREAM,
            SFC_DIGEST: meta.sha256_digest,
        }
        if encryption_metadata:
            # Client-side encryption material travels as object metadata.
            s3_metadata.update({
                AMZ_IV: encryption_metadata.iv,
                AMZ_KEY: encryption_metadata.key,
                AMZ_MATDESC: encryption_metadata.matdesc,
            })
        s3location = SnowflakeS3Util.extract_bucket_name_and_path(
            meta.client_meta.stage_info['location'])
        s3path = s3location.s3path + meta.dst_file_name.lstrip('/')
        akey = meta.client_meta.cloud_client.Object(
            s3location.bucket_name, s3path)
        extra_args = {'Metadata': s3_metadata}
        config = TransferConfig(
            multipart_threshold=multipart_threshold,
            max_concurrency=max_concurrency,
            num_download_attempts=10,
        )
        if meta.src_stream is None:
            # Upload from a file on disk.
            akey.upload_file(
                data_file,
                Callback=meta.put_callback(
                    data_file,
                    os.path.getsize(data_file),
                    output_stream=meta.put_callback_output_stream,
                    show_progress_bar=meta.show_progress_bar) if meta.put_callback else None,
                ExtraArgs=extra_args,
                Config=config)
        else:
            # Upload from an in-memory stream; prefer real_src_stream
            # (e.g. the encrypted variant) when present.
            upload_stream = meta.real_src_stream or meta.src_stream
            # seek to the end to learn the size, then rewind for upload.
            upload_size = upload_stream.seek(0, os.SEEK_END)
            upload_stream.seek(0)
            akey.upload_fileobj(
                upload_stream,
                Callback=meta.put_callback(
                    data_file,
                    upload_size,
                    output_stream=meta.put_callback_output_stream,
                    show_progress_bar=meta.show_progress_bar) if meta.put_callback else None,
                ExtraArgs=extra_args,
                Config=config,
            )
        logger.debug('DONE putting a file')
        meta.dst_file_size = meta.upload_size
        meta.result_status = ResultStatus.UPLOADED
    except botocore.exceptions.ClientError as err:
        if err.response['Error']['Code'] == EXPIRED_TOKEN:
            # Expired AWS token: tell the caller to renew and retry.
            logger.debug("AWS Token expired. Renew and retry")
            meta.result_status = ResultStatus.RENEW_TOKEN
            return
        logger.debug(f"Failed to upload a file: {data_file}, err: {err}",
                     exc_info=True)
        raise err
    except S3UploadFailedError as err:
        if EXPIRED_TOKEN in str(err):
            # Since AWS token expiration error can be encapsulated in
            # S3UploadFailedError, the text match is required to
            # identify the case.
            logger.debug(
                f'Failed to upload a file: {data_file}, err: {err}. Renewing AWS Token and Retrying'
            )
            meta.result_status = ResultStatus.RENEW_TOKEN
            return
        meta.last_error = err
        meta.result_status = ResultStatus.NEED_RETRY
    except OpenSSL.SSL.SysCallError as err:
        meta.last_error = err
        if err.args[0] == ERRORNO_WSAECONNABORTED:
            # connection was disconnected by S3
            # because of too many connections. retry with
            # less concurrency to mitigate it
            meta.result_status = ResultStatus.NEED_RETRY_WITH_LOWER_CONCURRENCY
        else:
            meta.result_status = ResultStatus.NEED_RETRY
def upload_file(data_file, meta, encryption_metadata, max_concurrency):
    """Upload a local file to S3 (legacy dict-based meta variant).

    On success ``meta[u'result_status']`` is set to UPLOADED; recoverable
    failures set RENEW_TOKEN / NEED_RETRY /
    NEED_RETRY_WITH_LOWER_CONCURRENCY instead of raising, so the caller
    drives the retry loop.

    :param data_file: file path on the local system
    :param meta: dict holding client, stage location and callbacks
    :param encryption_metadata: client-side encryption material, or falsy
    :param max_concurrency: maximum number of upload threads
    :return: None
    """
    logger = getLogger(__name__)
    try:
        # Content type plus the SHA-256 digest Snowflake uses to verify
        # the staged file.
        s3_metadata = {
            HTTP_HEADER_CONTENT_TYPE: HTTP_HEADER_VALUE_OCTET_STREAM,
            SFC_DIGEST: meta[SHA256_DIGEST],
        }
        if (encryption_metadata):
            # Client-side encryption material travels as object metadata.
            s3_metadata.update({
                AMZ_IV: encryption_metadata.iv,
                AMZ_KEY: encryption_metadata.key,
                AMZ_MATDESC: encryption_metadata.matdesc,
            })
        s3location = SnowflakeS3Util.extract_bucket_name_and_path(
            meta[u'stage_info'][u'location'])
        s3path = s3location.s3path + meta[u'dst_file_name'].lstrip('/')
        akey = meta[u'client'].Object(s3location.bucket_name, s3path)
        akey.upload_file(
            data_file,
            Callback=meta[u'put_callback'](
                data_file,
                os.path.getsize(data_file),
                output_stream=meta[u'put_callback_output_stream'],
                show_progress_bar=meta[u'show_progress_bar'])
            if meta[u'put_callback'] else None,
            ExtraArgs={
                u'Metadata': s3_metadata,
            },
            Config=TransferConfig(
                multipart_threshold=SnowflakeS3Util.DATA_SIZE_THRESHOLD,
                max_concurrency=max_concurrency,
                num_download_attempts=10,
            ))
        logger.debug(u'DONE putting a file')
        meta[u'dst_file_size'] = meta[u'upload_size']
        meta[u'result_status'] = ResultStatus.UPLOADED
    except botocore.exceptions.ClientError as err:
        if err.response[u'Error'][u'Code'] == EXPIRED_TOKEN:
            # Expired AWS token: tell the caller to renew and retry.
            logger.debug(u"AWS Token expired. Renew and retry")
            meta[u'result_status'] = ResultStatus.RENEW_TOKEN
            return
        logger.debug(u"Failed to upload a file: %s, err: %s",
                     data_file, err, exc_info=True)
        raise err
    except S3UploadFailedError as err:
        if EXPIRED_TOKEN in TO_UNICODE(err):
            # Since AWS token expiration error can be encapsulated in
            # S3UploadFailedError, the text match is required to
            # identify the case.
            logger.debug(
                'Failed to upload a file: %s, err: %s. Renewing '
                'AWS Token and Retrying', data_file, err)
            meta[u'result_status'] = ResultStatus.RENEW_TOKEN
            return
        meta[u'last_error'] = err
        meta[u'result_status'] = ResultStatus.NEED_RETRY
    except OpenSSL.SSL.SysCallError as err:
        meta[u'last_error'] = err
        if err.args[0] == ERRORNO_WSAECONNABORTED:
            # connection was disconnected by S3
            # because of too many connections. retry with
            # less concurrency to mitigate it
            meta[
                u'result_status'] = ResultStatus.NEED_RETRY_WITH_LOWER_CONCURRENCY
        else:
            meta[u'result_status'] = ResultStatus.NEED_RETRY
def s3_download(bucket,
                s3_filepath,
                local_filepath,
                profile_name='default',
                region_name='us-west-2',
                multipart_threshold=8388608,
                multipart_chunksize=8388608):
    """
    Downloads a file or collection of files from S3

    Parameters
    ----------
    bucket : str
        name of S3 bucket
    s3_filepath : str or list
        path and filename within bucket to file(s) you would like to download
    local_filepath : str or list
        path and filename for file(s) to be saved locally
    profile_name : str
        profile name for credentials (default 'default' or organization-specific)
    region_name : str
        name of AWS region (default value 'us-west-2')
    multipart_threshold : int
        minimum file size to initiate multipart download
    multipart_chunksize : int
        chunksize for multipart download

    Returns
    -------
    None

    Example use
    -----------
    # Downloading a single file from S3:
    s3_download(
        bucket='my_bucket',
        s3_filepath='tmp/my_file.csv',
        local_filepath='../data/my_file.csv')

    # Downloading with a profile name:
    s3_download(
        bucket='my_bucket',
        profile_name='my-profile-name',
        s3_filepath='tmp/my_file.csv',
        local_filepath='../data/my_file.csv')

    # Downloading a list of files from S3 (will not download contents of subdirectories):
    s3_download(
        bucket='my_bucket',
        s3_filepath=['tmp/my_file1.csv', 'tmp/my_file2.csv', 'img.png'],
        local_filepath=['../data/my_file1.csv', '../data/my_file2.csv', '../img.png'])

    # Downloading files matching a pattern from S3 (will not download contents of subdirectories):
    s3_download(
        bucket='my_bucket',
        s3_filepath='tmp/*.csv',
        local_filepath='../data/')

    # Downloading all files in a directory from S3 (will not download contents of subdirectories):
    s3_download(
        bucket='my_bucket',
        s3_filepath='tmp/*',
        local_filepath='../data/')
    """
    # validate s3_filepath and local_filepath arguments
    _download_upload_filepath_validator(s3_filepath=s3_filepath,
                                        local_filepath=local_filepath)
    # create bucket object
    my_bucket = s3_get_bucket(bucket=bucket,
                              profile_name=profile_name,
                              region_name=region_name)
    # multipart_threshold and multipart_chunksize, defaults = Amazon defaults
    config = TransferConfig(multipart_threshold=multipart_threshold,
                            multipart_chunksize=multipart_chunksize)
    if isinstance(s3_filepath, str):
        # find keys matching wildcard
        if '*' in s3_filepath:
            s3_filepath = _s3_glob(s3_filepath=s3_filepath,
                                   my_bucket=my_bucket)
            # derive each local filename from the tail of the matched key
            local_filepath = [
                os.path.join(local_filepath, key.split('/')[-1])
                for key in s3_filepath
            ]
        # insert into list so same looping structure can be used
        else:
            s3_filepath = [s3_filepath]
            local_filepath = [local_filepath]
    # download all files from S3
    for s3_key, local_file in zip(s3_filepath, local_filepath):
        try:
            my_bucket.download_file(s3_key, local_file, Config=config)
        except ClientError as e:
            error_code = int(e.response['Error']['Code'])
            # NOTE(review): NameError is a misleading exception type for a
            # credentials problem; kept as-is for backward compatibility
            # with callers that catch it.
            if error_code == 400:
                raise NameError('The credentials are expired or not valid. ' +
                                str(e))
            else:
                raise e
    return
def multi_part_upload_with_s3(filename=None, key_path=None, bucket=None,
                              upload_type="single"):
    """Upload a local file to the prep-logs S3 bucket, timing the transfer.

    :param filename: local file to upload (required; exits if None)
    :param key_path: destination key (defaults to filename)
    :param bucket: bucket suffix appended to 'prep-logs' (e.g. '-hk');
        empty/None selects the Korea bucket 'prep-logs-kr'
    :param upload_type: 'single' (anonymous, unsigned) or 'multi'
    """
    start_time = default_timer()
    bucket_name_prefix = "prep-logs"
    # Decode and export the AWS credential pair into the environment.
    key, sec = catchMeIfYouCan(aawwss_text)
    aaa_env, sss_env = catchMeIfYouCan(aawwss_env)
    os.environ[aaa_env] = key
    os.environ[sss_env] = sec
    if bucket is None or bucket == "":
        BUCKET_NAME = f"{bucket_name_prefix}-kr"
    else:
        BUCKET_NAME = f"{bucket_name_prefix}{bucket}"
    # NOTE(review): relies on a module-level `args` (argparse namespace).
    cprint(f"\t bucket {bucket} -> {BUCKET_NAME}") if args.verbose else False
    if bucket == "-hk":
        s3 = boto3.resource('s3', region_name="ap-east-1")
    else:
        s3 = boto3.resource('s3', )
    ##single parts
    if upload_type == "single":
        # Anonymous access: disable request signing for this client.
        s3.meta.client.meta.events.register('choose-signer.s3.*',
                                            disable_signing)
        # config = TransferConfig(use_threads=True, multipart_threshold=1024*1024*8, multipart_chunksize=1024*1024*8)
        # 800 MB threshold keeps anonymous uploads in a single PUT.
        config = TransferConfig(multipart_threshold=838860800,
                                max_concurrency=10,
                                multipart_chunksize=8388608,
                                num_download_attempts=5,
                                max_io_queue=100,
                                io_chunksize=262144,
                                use_threads=True)
    # multiparts mode -> AWS S3 CLI: Anonymous users cannot initiate multipart uploads
    elif upload_type == "multi":
        pass
        config = TransferConfig(multipart_threshold=1024 * 25,
                                max_concurrency=10,
                                multipart_chunksize=1024 * 25,
                                use_threads=True)
    else:
        # NOTE(review): this branch leaves `config` unbound, so the
        # upload below would raise NameError — confirm intended to abort.
        cprint(f"Unknown upload_type-> {upload_type}", "red")
    if filename is None:
        cprint(f"[ERROR] filename is None", "red")
        raise SystemExit()
    if key_path is None:
        key_path = filename
    try:
        s3.meta.client.upload_file(
            filename,
            BUCKET_NAME,
            key_path,
            # ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/pdf'},
            Config=config,
            Callback=ProgressPercentage(filename))
    except Exception as e:
        e = str(e).replace(":", ":\n")
        cprint(f"\n[ERROR] File upload fail / cause->{e}\n", "red")
        raise SystemExit()
    elapsed = default_timer() - start_time
    time_completed_at = "{:5.3f}s".format(elapsed)
    cprint(f"\n\t time_completed_at = {time_completed_at}")
def stream_time_range_s3(start_date, end_date, aws_key, aws_secret,
                         aws_bucket_name, key, delta, max_workers=10):
    """Download NOAA NARR .grb files for a date range, zip and upload to S3.

    For every day between ``start_date`` and ``end_date`` (and each 3-hour
    timestamp within the day) a download URL is built; the files are fetched
    in parallel, written into one compressed zip archive, and the archive is
    uploaded to ``aws_bucket_name`` under ``key``.

    All dates in the NOAA NARR FTP server are stored following this order:

        data
        ├── year/month
        ├── year/month/day01
        ├── year/month/day02

    Params:
        - start_date (str or datetime): first day to download ('%Y-%m-%d'
          when a string).
        - end_date (datetime): last day to download.
        - aws_key, aws_secret: accepted but unused — the boto3 'default'
          profile supplies credentials.  # NOTE(review): confirm intended.
        - aws_bucket_name (str): destination S3 bucket.
        - key (str): destination S3 object key.
        - delta: optional extra range forwarded to datetime_range; None
          means one-day steps.
        - max_workers (int): size of the download process pool.

    Returns:
        str: local path of the zip archive that was uploaded.

    Raises:
        ValueError: if start_date is neither a datetime nor a string.
    """
    GB = 1024**3
    session = boto3.Session(profile_name='default')
    s3 = session.client('s3')
    # Multipart only above 5 GB, i.e. effectively single-part uploads.
    config = TransferConfig(multipart_threshold=5 * GB)
    base_url = 'https://nomads.ncdc.noaa.gov/data/narr'
    # NARR publishes one file per 3-hour timestamp.
    # (Renamed from `time`, which the original code shadowed in the
    # product() loop below.)
    times = ['0000', '0300', '0600', '0900', '1200', '1500', '1800', '2100']
    if isinstance(start_date, str) and not isinstance(start_date, datetime):
        start_date = datetime.strptime(start_date, '%Y-%m-%d')
    elif not isinstance(start_date, datetime):
        # BUG FIX: the original built the ValueError but never raised it,
        # silently continuing with an unusable start_date.
        raise ValueError(
            f'{start_date} is not in the correct format or not a valid type')
    if delta is None:
        dates = datetime_range(start_date, end_date, {'days': 1})
    else:
        dates = datetime_range(start_date, end_date + delta)
    urls_time_range = []
    for day, timestamp in product(dates, times):
        file_name = f'narr-a_221_{day.strftime("%Y%m%d")}_{timestamp}_000.grb'
        url = URL(base_url, day.strftime('%Y%m'), day.strftime('%Y%m%d'))
        urls_time_range.append(str(URL(url, file_name)))
    # Fetch all files in parallel; each result is (name, bytes).
    with multiprocessing.Pool(max_workers) as p:
        results = p.map(requests_to_s3, urls_time_range, chunksize=1)
    logger.info('Finish download')
    temp_dir = mkdtemp()
    temp_file = NamedTemporaryFile()
    # NOTE(review): temp_file.name is an absolute path, so os.path.join
    # ignores temp_dir and the zip is written over the NamedTemporaryFile
    # itself — behavior preserved, but worth confirming.
    path_to_temp_file = os.path.join(temp_dir, temp_file.name)
    with zipfile.ZipFile(path_to_temp_file,
                         mode='w',
                         compression=zipfile.ZIP_DEFLATED,
                         compresslevel=1) as zf:
        for content_file_name, content_file_result in results:
            try:
                zf.writestr(content_file_name, content_file_result)
            except Exception as exc:
                print(exc)
    logger.info('Finish zipping - Upload Start')
    s3.upload_file(path_to_temp_file, aws_bucket_name, key, Config=config)
    return path_to_temp_file
def source_dataset():
    """Download the source dataset and upload it to S3 when it changed.

    Reads DATA_SET_NAME and S3_BUCKET from the environment, downloads the
    dataset with up to five attempts, compares its MD5 against the copy
    already in S3, and uploads only when the content differs.

    Returns:
        list: asset-source dicts ({'Bucket', 'Key'}) when at least one
        upload happened; empty list otherwise. Consumed by lambda_handler,
        which skips republishing on an empty list.

    Raises:
        Exception: when the download ultimately fails or when an upload
        was expected but nothing was recorded.
    """
    source_dataset_url = get_dataset_url()
    response = None
    retries = 5
    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url)
        except HTTPError as e:
            # BUG FIX: the original tested `attempt == retries`, which is
            # never true for attempt in range(retries), so every error was
            # swallowed and the loop exhausted silently.
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            # Linear back-off (first retry is immediate).
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break
    if response is None:
        raise Exception('There was an issue downloading the dataset')
    data_set_name = os.environ['DATA_SET_NAME']
    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    file_location = os.path.join(data_dir, data_set_name + '.csv')
    s3_bucket = os.environ['S3_BUCKET']
    s3 = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    # (Removed an unused TransferConfig local that the original built and
    # never passed to any transfer call.)
    s3_uploads = []
    asset_list = []
    # Normalize the object name: strip leading path, no spaces, lowercase.
    obj_name = file_location.split('/', 3).pop().replace(' ', '_').lower()
    file_location = os.path.join(data_dir, obj_name)
    new_s3_key = data_set_name + '/dataset/' + obj_name
    filedata = response.read()
    # Upload only when the MD5 of the fresh download differs from S3's copy.
    has_changes = md5_compare(s3, s3_bucket, new_s3_key, BytesIO(filedata))
    if has_changes:
        s3_resource.Object(s3_bucket, new_s3_key).put(Body=filedata)
        print('Uploaded: ' + file_location)
    else:
        print('No changes in: ' + file_location)
    asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
    s3_uploads.append({'has_changes': has_changes,
                       'asset_source': asset_source})
    count_updated_data = sum(
        upload['has_changes'] == True for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')
    # asset_list is returned to be used in lamdba_handler function
    # if it is empty, lambda_handler will not republish
    return asset_list