def _multipart_upload(bucketname, aws_key, aws_secret, source_path, keyname,
                      reduced, debug, cb, num_cb, acl='private', headers=None,
                      guess_mimetype=True, parallel_processes=4):
    """
    Parallel multipart upload.
    """
    # Avoid a mutable default argument: a shared default dict would carry
    # the Content-Type set by a previous call over into later ones
    if headers is None:
        headers = {}

    conn = connect_s3(aws_key, aws_secret)
    conn.debug = debug
    bucket = conn.get_bucket(bucketname)

    if guess_mimetype:
        mtype = mimetypes.guess_type(keyname)[0] or 'application/octet-stream'
        headers.update({'Content-Type': mtype})

    mp = bucket.initiate_multipart_upload(keyname, headers=headers,
                                          reduced_redundancy=reduced)

    # Part size scales with the square root of the file size, but never
    # drops below S3's 5 MiB minimum part size (5242880 bytes)
    source_size = os.stat(source_path).st_size
    bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)),
                          5242880)
    chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))

    # Upload the parts from a pool of worker processes
    pool = Pool(processes=parallel_processes)
    for i in range(chunk_amount):
        offset = i * bytes_per_chunk
        remaining_bytes = source_size - offset
        bytes = min([bytes_per_chunk, remaining_bytes])
        part_num = i + 1
        pool.apply_async(_upload_part, [bucketname, aws_key, aws_secret,
                                        mp.id, part_num, source_path,
                                        offset, bytes, debug, cb, num_cb])
    pool.close()
    pool.join()

    # Complete the upload only if every part arrived; otherwise cancel it
    # so the orphaned parts don't keep accruing storage charges
    if len(mp.get_all_parts()) == chunk_amount:
        mp.complete_upload()
        key = bucket.get_key(keyname)
        key.set_acl(acl)
    else:
        mp.cancel_upload()
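# To see what the square-root heuristic above does in practice, here is the
# arithmetic for a hypothetical 1 GiB source file (illustrative numbers only):
#
#     source_size = 1024 ** 3                       # 1073741824 bytes
#     math.sqrt(5242880) * math.sqrt(source_size)   # ~75029990 bytes (~71.6 MiB)
#     chunk_amount                                  # ceil(~14.3) -> 15 parts
#
# The max() against 5242880 keeps every part at or above S3's 5 MiB minimum
# part size, while the sqrt scaling grows the part size (and the part count)
# with the square root of the file size rather than linearly.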
def fetch_path(key_path=None, bucket_name=None, overwrite=0,
               aws_access_key_id=None, aws_secret_access_key=None):
    """
    Fetches a path from an S3 bucket

    If the key in the s3 bucket contains slashes, interpret it as a
    file tree and replicate it locally
    """
    prefix = os.getcwd() + '/'
    key_prefix = ''
    overwrite = int(overwrite)
    cb = _progress_cb
    num_cb = 100

    conn = connect_s3(aws_access_key_id, aws_secret_access_key)
    b = conn.get_bucket(bucket_name)
    remote_keys = [k for k in b.list(key_path)]
    if len(remote_keys) == 0:
        print 'No files matching path in bucket'
        sys.exit(0)

    for k in remote_keys:
        filename = _get_local_path(k.key, prefix, key_prefix)
        filedir = os.path.dirname(filename)
        # Create the local directory tree, tolerating ones that already exist
        try:
            os.makedirs(filedir)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        if os.path.exists(filename):
            if not overwrite:
                print 'File {} already exists. Skipping'.format(filename)
                continue
            print 'File {} already exists. Overwriting'.format(filename)

        print 'Retrieving {} to {}'.format(k.key, filename)
        # Write in binary mode and make sure the handle is closed
        with open(filename, 'wb') as outfile:
            k.get_contents_to_file(outfile, cb=cb, num_cb=num_cb)
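# Example usage of fetch_path (illustrative; the bucket name, key prefix,
# and credentials below are hypothetical):
#
#     fetch_path(key_path='reports/2014/',
#                bucket_name='my-data-bucket',
#                overwrite=1,
#                aws_access_key_id='AKIA...',
#                aws_secret_access_key='...')
#
# This would mirror every key under reports/2014/ into matching
# subdirectories of the current working directory, overwriting any local
# copies that already exist.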
def _upload_part(bucketname, aws_key, aws_secret, multipart_id, part_num,
                 source_path, offset, bytes, debug, cb, num_cb,
                 amount_of_retries=10):
    """
    Uploads a single part of a multipart upload, retrying on failure.
    """
    def _upload(retries_left=amount_of_retries):
        try:
            if debug == 1:
                print 'Start uploading part #%d ...' % part_num
            conn = connect_s3(aws_key, aws_secret)
            conn.debug = debug
            bucket = conn.get_bucket(bucketname)
            # Find the in-progress multipart upload and send this part's
            # slice of the source file
            for mp in bucket.get_all_multipart_uploads():
                if mp.id == multipart_id:
                    with FileChunkIO(source_path, 'r', offset=offset,
                                     bytes=bytes) as fp:
                        mp.upload_part_from_file(fp=fp, part_num=part_num,
                                                 cb=cb, num_cb=num_cb)
                    break
        except Exception as exc:
            # Retry with a smaller budget; once it is exhausted, give up
            # and re-raise so the failure surfaces in the worker pool
            if retries_left:
                _upload(retries_left=retries_left - 1)
            else:
                print 'Failed uploading part #%d' % part_num
                raise exc

    _upload()
def put_path(path=None, bucket_name=None, overwrite=0,
             aws_access_key_id=None, aws_secret_access_key=None):
    """
    Puts a path to S3

    If the path is a file, puts just the file into the bucket
    If the path is a folder, recursively puts the folder into the bucket
    """
    if bucket_name is None:
        print 'You must provide a bucket name'
        sys.exit(0)

    cb = _progress_cb
    num_cb = 100
    debug = 0
    reduced = True
    grant = None
    headers = {}
    overwrite = int(overwrite)

    conn = connect_s3(aws_access_key_id, aws_secret_access_key)
    b = conn.get_bucket(bucket_name)
    path = _expand_path(path)

    files_to_check_for_upload = []
    existing_keys_to_check_against = []
    prefix = os.getcwd() + '/'
    key_prefix = ''

    # Take inventory of the files to upload
    # For directories, walk recursively
    files_in_bucket = [k.name for k in b.list()]
    if os.path.isdir(path):
        print 'Getting list of existing keys to check against'
        for root, dirs, files in os.walk(path):
            for p in files:
                if p.startswith("."):
                    continue
                full_path = os.path.join(root, p)
                key_name = _get_key_name(full_path, prefix, key_prefix)
                files_to_check_for_upload.append(full_path)
                if key_name in files_in_bucket:
                    existing_keys_to_check_against.append(full_path)
    # For single files, just add the file
    elif os.path.isfile(path):
        full_path = os.path.abspath(path)
        key_name = _get_key_name(full_path, prefix, key_prefix)
        files_to_check_for_upload.append(full_path)
        if key_name in files_in_bucket:
            existing_keys_to_check_against.append(full_path)
    # We are trying to upload something unknown
    else:
        print "I don't know what %s is, so I can't upload it" % path
        return

    print "{} files to upload:".format(len(files_to_check_for_upload))
    pprint(files_to_check_for_upload)
    print "{} existing files already in bucket:".format(
        len(existing_keys_to_check_against))
    pprint(existing_keys_to_check_against)

    for full_path in files_to_check_for_upload:
        key_name = _get_key_name(full_path, prefix, key_prefix)

        if full_path in existing_keys_to_check_against:
            if not overwrite and b.get_key(key_name):
                print 'Skipping %s as it exists in s3' % full_path
                continue

        print 'Copying %s to %s/%s' % (full_path, bucket_name, key_name)

        # 0-byte files don't work and also don't need multipart upload
        if os.stat(full_path).st_size != 0 and multipart_capable:
            _multipart_upload(bucket_name, aws_access_key_id,
                              aws_secret_access_key, full_path, key_name,
                              reduced, debug, cb, num_cb,
                              grant or 'private', headers)
        else:
            _singlepart_upload(b, key_name, full_path, cb=cb, num_cb=num_cb,
                               policy=grant, reduced_redundancy=reduced,
                               headers=headers)
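# Example usage of put_path (illustrative; the path, bucket name, and
# credentials are hypothetical):
#
#     put_path(path='~/data/reports',
#              bucket_name='my-data-bucket',
#              overwrite=0,
#              aws_access_key_id='AKIA...',
#              aws_secret_access_key='...')
#
# With overwrite=0 this walks ~/data/reports, skips hidden files and any
# key that already exists in the bucket, and uploads everything else,
# taking the parallel multipart path for non-empty files whenever
# multipart support is available.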