def ls(self, s3prefix, return_full_urls=False, require_s3_scheme=False, shallow=False, followlinks=False, list_versions=False):
    '''
    List files on AWS S3.

    The prefix is given as an S3 url: ``s3://bucket-name/path/to/dir``.
    It will return all values in the bucket that have that prefix.

    Note that ``/dir/filename.ext`` is found by ``ls('s3://bucket-name/dir/fil')``;
    it's really a prefix and not a directory name.

    A local prefix is generally acceptable, but if require_s3_scheme
    is True, the prefix must be an s3 URL.

    If `shallow` is `True`, the key names are processed hierarchically
    using '/' as a delimiter, and only the immediate "children" are returned.
    '''
    import six
    k = path.parse(s3prefix)
    if k.scheme == 's3':
        prefix = k.path
        if prefix.startswith(path.sep):
            prefix = prefix[len(path.sep):]
        delimiter = path.sep if shallow else ''
        if return_full_urls:
            clean_paths = lambda x: "s3://" + k.netloc + path.sep + x.name
        else:
            clean_paths = lambda x: path.sep + x.name
        if list_versions:
            result_list_iterator = self._bucket(k.netloc).list_versions(prefix=prefix, delimiter=delimiter)
        else:
            result_list_iterator = self._bucket(k.netloc).list(prefix=prefix, delimiter=delimiter)
        return six.moves.map(clean_paths, result_list_iterator)
    elif k.scheme == 'file':
        if require_s3_scheme:
            raise InvalidSchemeException('URI should begin with s3://')
        paths = []
        remove = ''
        if not return_full_urls:
            remove = k.path
            if not remove.endswith(os.sep):
                remove += os.sep
        for root, _, files in os.walk(k.path, followlinks=followlinks):
            for f in files:
                # On Windows, results of os.abspath() and os.walk() have '\',
                # so we replace them with '/'
                paths.append(path.join(root, f).replace(remove, '').replace(os.sep, path.sep))
            if shallow:
                break
        return paths
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
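# Usage sketch (assumption, not part of the original module): how `ls` might be
# called. The bucket name and prefix are hypothetical; `S3Connection` is assumed
# to be the class these methods belong to (baiji.connection.S3Connection).
def _example_ls_usage():
    from baiji.connection import S3Connection
    conn = S3Connection()
    # All keys under a prefix, returned as bucket-relative paths:
    keys = list(conn.ls('s3://example-bucket/data/'))
    # Only the immediate "children" of the prefix, returned as full s3:// URLs:
    children = list(conn.ls('s3://example-bucket/data/', shallow=True, return_full_urls=True))
    return keys, children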
def info(self, key_or_file):
    '''
    Get info about a file
    '''
    from datetime import datetime
    k = path.parse(key_or_file)
    result = {
        'uri': '%s://%s%s' % (k.scheme, k.netloc, k.path),
    }
    if k.scheme == 'file':
        if not os.path.exists(k.path):
            raise KeyNotFound("Error getting info on %s: File doesn't exist" % (key_or_file, ))
        stat = os.stat(k.path)
        result['size'] = stat.st_size
        result['last_modified'] = datetime.fromtimestamp(stat.st_mtime)
    elif k.scheme == 's3':
        remote_object = self._lookup(k.netloc, k.path)
        if remote_object is None:
            raise KeyNotFound("Error getting info on %s: Key doesn't exist" % (key_or_file, ))
        result['size'] = remote_object.size
        result['last_modified'] = datetime.strptime(remote_object.last_modified, "%a, %d %b %Y %H:%M:%S GMT")
        result['content_type'] = remote_object.content_type
        result['content_encoding'] = remote_object.content_encoding
        result['encrypted'] = bool(remote_object.encrypted)
        result['acl'] = remote_object.get_acl()
        result['owner'] = remote_object.owner
        result['version_id'] = remote_object.version_id
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
    return result
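# Usage sketch (assumption, not part of the original module): `info` returns a
# dict whose S3-only fields (content_type, acl, etc.) are listed above. The key
# name below is hypothetical.
def _example_info_usage():
    from baiji.connection import S3Connection
    meta = S3Connection().info('s3://example-bucket/data/file.txt')
    # 'size' and 'last_modified' are present for both local and s3 schemes.
    return meta['size'], meta['last_modified']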
def size(self, key_or_file, version_id=None):
    '''
    Return the size of a file. If it's on s3, don't download it.
    '''
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        return os.path.getsize(k.path)
    elif k.scheme == 's3':
        # Look up the remote key without rebinding `k`, so the error message
        # below can still reference the parsed bucket and path.
        remote_key = self._lookup(k.netloc, k.path, version_id=version_id)
        if remote_key is None:
            raise KeyNotFound("s3://%s/%s not found on s3" % (k.netloc, k.path))
        return remote_key.size
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
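# Usage sketch (assumption, not part of the original module): checking a remote
# file's size without downloading it. The key is hypothetical; a version_id can
# be passed for versioned buckets.
def _example_size_usage():
    from baiji.connection import S3Connection
    return S3Connection().size('s3://example-bucket/data/file.txt')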
def isfile(key):
    '''
    Return true if key is a file; local or s3.
    '''
    from baiji.connection import S3Connection
    from baiji.exceptions import InvalidSchemeException
    k = parse(key)
    if islocal(key):  # This really only ensures that scheme == 'file'
        return os.path.isfile(k.path)
    if isremote(key):  # scheme == 's3'
        # exists currently only works for files on s3 because
        # directories don't exist on s3, only files.
        return S3Connection().exists(key)
    else:
        raise InvalidSchemeException("URI Scheme {} is not implemented".format(k.scheme))
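# Usage sketch (assumption, not part of the original module): `isfile` accepts
# either a local path or an s3:// URL. Paths below are hypothetical.
def _example_isfile_usage():
    local = isfile('/tmp/example.txt')                   # checked with os.path.isfile
    remote = isfile('s3://example-bucket/example.txt')   # checked via S3Connection().exists
    return local, remote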
def md5(self, key_or_file):
    '''
    Return the MD5 checksum of a file. If it's on s3, don't download it.
    '''
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        from baiji.util.md5 import md5_for_file
        return md5_for_file(k.path)
    elif k.scheme == 's3':
        res = self._get_etag(k.netloc, k.path)
        if "-" in res:
            raise ValueError("md5 hashes not available from s3 for files that were uploaded as multipart (if over 5gb, there's no hope; if under, try copying it to itself to have S3 reset the etag)")
        return res
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
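# Usage sketch (assumption, not part of the original module): for multipart
# uploads the S3 etag is not an MD5 digest, so `md5` raises ValueError in that
# case. The key name is hypothetical.
def _example_md5_usage():
    from baiji.connection import S3Connection
    try:
        return S3Connection().md5('s3://example-bucket/data/file.txt')
    except ValueError:
        # Uploaded as multipart; the etag is not a plain MD5.
        return None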
def __init__(self, key, connection):
    import re
    from baiji import path
    self.raw = key
    self.connection = connection
    self.parsed = path.parse(key)
    self.remote_path = None  # value here will be set by the path setting, this just satisfies lint
    self.isdir = path.isdirlike(key)
    self.path = self.parsed.path
    if not (self.path.startswith(path.sep) or re.match(r'^[a-zA-Z]:', self.path)):
        self.path = path.sep + self.path
    self.bucket_name = self.parsed.netloc
    self.scheme = self.parsed.scheme
    if self.scheme not in ['file', 's3']:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % self.scheme)
def encrypt_at_rest(self, key):
    '''
    This method takes a key on s3 and encrypts it.
    Note that calling this method on a local file is an error
    and that calling it on an s3 key that is already encrypted,
    while allowed, is a no-op.
    '''
    k = path.parse(key)
    if k.scheme != 's3':
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
    remote_object = self._lookup(k.netloc, k.path)
    if remote_object is None:
        raise KeyNotFound("Error encrypting %s: Key doesn't exist" % (key, ))
    if not bool(remote_object.encrypted):
        bucket = self._bucket(k.netloc)
        src = k.path
        if src.startswith(path.sep):
            src = src[len(path.sep):]  # NB: copy_key is failing with absolute src keys...
        bucket.copy_key(src, k.netloc, src, preserve_acl=True, metadata=None, encrypt_key=True)
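# Usage sketch (assumption, not part of the original module): encrypting an
# existing S3 key in place. Calling it on an already-encrypted key is a no-op;
# local paths raise InvalidSchemeException. The key name is hypothetical.
def _example_encrypt_at_rest_usage():
    from baiji.connection import S3Connection
    S3Connection().encrypt_at_rest('s3://example-bucket/data/secret.csv')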
def etag(self, key_or_file):
    '''
    Return the s3 etag of the file. For single part uploads (for us, files
    less than 5gb) this is the same as md5.
    '''
    from baiji.copy import S3_MAX_UPLOAD_SIZE
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        import math
        from baiji.util.md5 import md5_for_file
        file_size = os.path.getsize(k.path)
        if file_size > S3_MAX_UPLOAD_SIZE:
            n_parts = int(math.ceil(float(file_size) / S3_MAX_UPLOAD_SIZE))
            return self._build_etag(k.path, n_parts, S3_MAX_UPLOAD_SIZE)
        else:
            return md5_for_file(k.path)
    elif k.scheme == 's3':
        return self._get_etag(k.netloc, k.path)
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
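# Usage sketch (assumption, not part of the original module): comparing a local
# file's computed etag with the one S3 reports, e.g. to verify an upload.
# Both paths are hypothetical.
def _example_etag_usage():
    from baiji.connection import S3Connection
    conn = S3Connection()
    return conn.etag('/tmp/example.bin') == conn.etag('s3://example-bucket/example.bin')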
def rm_r(self, key_or_file, force=False, quiet=False):
    '''
    Recursively remove all keys under an S3 prefix.

    Prompts for confirmation on each file when force is False.

    Raises an exception when the URI is not on AWS S3.
    '''
    k = path.parse(key_or_file)
    if k.scheme != 's3':
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
    bucket = k.netloc
    keys_to_delete = self.ls(key_or_file)
    for key_to_delete in keys_to_delete:
        url = "s3://%s%s" % (bucket, key_to_delete)
        if not force:
            from baiji.util.console import confirm
            if not confirm("Remove %s" % url):
                continue
        self.rm(url)
        if not quiet:
            print("[deleted] %s" % url)
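# Usage sketch (assumption, not part of the original module): recursive delete
# under a prefix. With force=False the call prompts per key, so force=True is
# what you'd use non-interactively. The prefix is hypothetical.
def _example_rm_r_usage():
    from baiji.connection import S3Connection
    S3Connection().rm_r('s3://example-bucket/tmp/scratch/', force=True, quiet=True)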
def rm(self, key_or_file, version_id=None):
    '''
    Remove a local file or directory, or a key from AWS S3.
    '''
    import shutil
    from baiji.util.munging import _strip_initial_slashes
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        if os.path.isdir(k.path):
            shutil.rmtree(k.path)
        elif os.path.exists(k.path):
            return os.remove(k.path)
        else:
            raise KeyNotFound("%s does not exist" % key_or_file)
    elif k.scheme == 's3':
        if not self.exists(key_or_file, version_id=version_id):
            raise KeyNotFound("%s does not exist" % key_or_file)
        return self._bucket(k.netloc).delete_key(_strip_initial_slashes(k.path), version_id=version_id)
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
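# Usage sketch (assumption, not part of the original module): removing a single
# key or local path; a missing target raises KeyNotFound. The key is hypothetical.
def _example_rm_usage():
    from baiji.connection import S3Connection
    S3Connection().rm('s3://example-bucket/tmp/old-file.txt')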
def exists(self, key_or_file, retries_allowed=3, version_id=None):
    '''
    Check if a file exists on AWS S3.

    Returns a boolean.

    If the key is not found then we recheck up to `retries_allowed` times. We only
    do this on s3. We've had some observations of what appears to be eventual
    consistency, so this makes it a bit more reliable. This does slow down the call
    in the case where the key does not exist.

    On a relatively slow, high-latency connection, a test of 100 lookups of a
    non-existent file gives:

    With retries_allowed=1: median=457.587 ms, mean=707.12387 ms
    With retries_allowed=3: median=722.969 ms, mean=1185.86299 ms
    With retries_allowed=10: median=2489.767 ms, mean=2995.34233 ms
    With retries_allowed=100: median=24694.0815 ms, mean=26754.64137 ms

    So assume that leaving retries_allowed=3 will cost you a bit less than double the time.
    '''
    k = path.parse(key_or_file)
    if k.scheme == 'file':
        return os.path.exists(k.path)
    elif k.scheme == 's3':
        retry_attempts = 0
        while retry_attempts < retries_allowed:
            key = self._lookup(k.netloc, k.path, cache_buckets=True, version_id=version_id)
            if key:
                if retry_attempts > 0:  # only if we find it after failing at least once
                    import warnings
                    from baiji.exceptions import EventualConsistencyWarning
                    warnings.warn("S3 is behaving in an eventually consistent way in s3.exists({}) -- it took {} attempts to locate the key".format(key_or_file, retry_attempts + 1), EventualConsistencyWarning)
                return True
            retry_attempts += 1
        return False
    else:
        raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
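# Usage sketch (assumption, not part of the original module): existence check
# with the retry behavior disabled. Passing retries_allowed=1 skips the
# eventual-consistency rechecks and is faster when the key is expected to be
# absent. The key name is hypothetical.
def _example_exists_usage():
    from baiji.connection import S3Connection
    return S3Connection().exists('s3://example-bucket/data/file.txt', retries_allowed=1)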
def isdir(key):
    '''
    Return true if key is directory-ish. That is, it ends with a path
    separator, or is a local directory that actually exists.

    On S3 a "directory" is considered to exist if one or more files exist
    that have the "directory" (ending with sep) as a prefix.
    '''
    from baiji.connection import S3Connection
    from baiji.exceptions import InvalidSchemeException
    k = parse(key)
    if islocal(key):  # This really only ensures that scheme == 'file'
        return os.path.isdir(k.path)
    if isremote(key):  # scheme == 's3'
        if not k.path.endswith(sep):
            k = parse(key + sep)
        try:
            next(S3Connection().ls(k.geturl()))
            return True
        except StopIteration:
            return False
    else:
        raise InvalidSchemeException("URI Scheme {} is not implemented".format(k.scheme))
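# Usage sketch (assumption, not part of the original module): on S3 a "directory"
# exists if at least one key has the directory (with trailing separator) as a
# prefix. Paths below are hypothetical.
def _example_isdir_usage():
    return isdir('/tmp/some-dir'), isdir('s3://example-bucket/data/')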
def execute(self):
    from boto.s3.connection import S3ResponseError
    if not self.force and self.dst.exists():
        if self.skip:
            import warnings
            warnings.warn("Skipping existing destination copying %s to %s: Destination exists" % (self.src.uri, self.dst.uri))
            return
        else:
            raise KeyExists("Error copying %s to %s: Destination exists" % (self.src.uri, self.dst.uri))
    if self.dst.is_file:
        self.prep_local_destination()
    try:
        if self.task == ('file', 'file'):
            self.local_copy()
        elif self.task == ('file', 's3'):
            self.upload()
        elif self.task == ('s3', 'file'):
            self.download()
        elif self.task == ('s3', 's3'):
            self.remote_copy()
        else:
            raise InvalidSchemeException("Copy for URI Scheme %s to %s is not implemented" % self.task)
    except KeyNotFound:
        if self.dst.is_s3:
            try:
                _ = self.dst.bucket
            except KeyNotFound:
                raise KeyNotFound("Error copying {} to {}: Destination bucket doesn't exist".format(self.src.uri, self.dst.uri))
        if not self.src.exists():
            raise KeyNotFound("Error copying {} to {}: Source doesn't exist".format(self.src.uri, self.dst.uri))
        else:
            raise KeyNotFound("Error copying {} to {}: Destination doesn't exist".format(self.src.uri, self.dst.uri))
    except IOError as e:
        import errno
        if e.errno == errno.ENOENT:
            raise KeyNotFound("Error copying {} to {}: Source doesn't exist".format(self.src.uri, self.dst.uri))
        else:
            raise S3Exception("Error copying {} to {}: {}".format(self.src.uri, self.dst.uri, e))
    except S3ResponseError as e:
        if e.status == 403:
            raise S3Exception("HTTP Error 403: Permission Denied on {}".format(" or ".join([x.uri for x in [self.src, self.dst] if x.is_s3])))
        else:
            raise