Example #1
    def ls(self, s3prefix, return_full_urls=False, require_s3_scheme=False, shallow=False, followlinks=False, list_versions=False):
        '''
        List files on AWS S3.

        The prefix is given as an s3 URL: ``s3://bucket-name/path/to/dir``.
        All keys in the bucket that share that prefix are returned.

        Note that ``/dir/filename.ext`` is found by ``ls('s3://bucket-name/dir/fil')``;
        the argument is really a key prefix, not a directory name.

        A local prefix is also acceptable, but if `require_s3_scheme`
        is True, the prefix must be an s3 URL.

        If `shallow` is True, the key names are processed hierarchically
        using '/' as a delimiter, and only the immediate "children" are
        returned.

        `followlinks` applies only to local listings; `list_versions` only to s3.
        '''
        import six
        k = path.parse(s3prefix)
        if k.scheme == 's3':
            prefix = k.path
            if prefix.startswith(path.sep):
                prefix = prefix[len(path.sep):]
            delimiter = path.sep if shallow else ''
            if return_full_urls:
                clean_paths = lambda x: "s3://" + k.netloc + path.sep + x.name
            else:
                clean_paths = lambda x: path.sep + x.name

            if list_versions:
                result_list_iterator = self._bucket(k.netloc).list_versions(prefix=prefix, delimiter=delimiter)
            else:
                result_list_iterator = self._bucket(k.netloc).list(prefix=prefix, delimiter=delimiter)

            return six.moves.map(clean_paths, result_list_iterator)
        elif k.scheme == 'file':
            if require_s3_scheme:
                raise InvalidSchemeException('URI should begin with s3://')
            paths = []
            remove = ''
            if not return_full_urls:
                remove = k.path
                if not remove.endswith(os.sep):
                    remove += os.sep
            for root, _, files in os.walk(k.path, followlinks=followlinks):
                for f in files:
                    # On Windows, results of os.path.abspath() and os.walk() use '\',
                    # so we replace them with '/'. Strip `remove` only when it is a
                    # prefix; a plain str.replace could also clobber a later
                    # occurrence of the same substring.
                    walked_path = path.join(root, f)
                    if remove and walked_path.startswith(remove):
                        walked_path = walked_path[len(remove):]
                    paths.append(walked_path.replace(os.sep, path.sep))
                if shallow:
                    break
            return paths
        else:
            raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
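A usage sketch, assuming these methods live on baiji's S3Connection (the self._bucket calls suggest so) and that AWS credentials are configured; the bucket name and prefix are hypothetical:

    from baiji.connection import S3Connection

    s3 = S3Connection()
    # shallow=True lists only the immediate "children" under logs/;
    # return_full_urls=True yields s3:// URLs instead of bare key paths.
    for url in s3.ls('s3://example-bucket/logs/', return_full_urls=True, shallow=True):
        print(url)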
Example #2
 def info(self, key_or_file):
     '''
     Get a dict of metadata about a local file or s3 key: uri, size, and
     last_modified; for s3 keys also content type, encoding, encryption,
     ACL, owner, and version id.
     '''
     from datetime import datetime
     k = path.parse(key_or_file)
     result = {
         'uri': '%s://%s%s' % (k.scheme, k.netloc, k.path),
     }
     if k.scheme == 'file':
         if not os.path.exists(k.path):
             raise KeyNotFound("Error getting info on %s: File doesn't exist" % (key_or_file, ))
         stat = os.stat(k.path)
         result['size'] = stat.st_size
         result['last_modified'] = datetime.fromtimestamp(stat.st_mtime)
     elif k.scheme == 's3':
         remote_object = self._lookup(k.netloc, k.path)
         if remote_object is None:
             raise KeyNotFound("Error getting info on %s: Key doesn't exist" % (key_or_file, ))
         result['size'] = remote_object.size
         result['last_modified'] = datetime.strptime(remote_object.last_modified, "%a, %d %b %Y %H:%M:%S GMT")
         result['content_type'] = remote_object.content_type
         result['content_encoding'] = remote_object.content_encoding
         result['encrypted'] = bool(remote_object.encrypted)
         result['acl'] = remote_object.get_acl()
         result['owner'] = remote_object.owner
         result['version_id'] = remote_object.version_id
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
     return result
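A usage sketch under the same assumptions; the key is hypothetical, and a local path works as well:

    from baiji.connection import S3Connection

    info = S3Connection().info('s3://example-bucket/data/model.obj')
    print(info['size'], info['last_modified'])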
Example #3
 def size(self, key_or_file, version_id=None):
     '''
     Return the size of a file in bytes. If it's on s3, don't download it.
     '''
     k = path.parse(key_or_file)
     if k.scheme == 'file':
         return os.path.getsize(k.path)
     elif k.scheme == 's3':
         # Don't shadow `k` with the lookup result: if the key is missing,
         # the error message below still needs `k.netloc` and `k.path`.
         remote_object = self._lookup(k.netloc, k.path, version_id=version_id)
         if remote_object is None:
             raise KeyNotFound("s3://%s/%s not found on s3" % (k.netloc, k.path))
         return remote_object.size
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
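A usage sketch; the key is hypothetical:

    from baiji.connection import S3Connection

    # Size in bytes, without downloading the object.
    print(S3Connection().size('s3://example-bucket/data/model.obj'))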
Example #4
def isfile(key):
    '''
    Return true if key is file; local or s3.
    '''
    from baiji.connection import S3Connection
    from baiji.exceptions import InvalidSchemeException
    k = parse(key)
    if islocal(key):  # this really only ensures that scheme == 'file'
        return os.path.isfile(k.path)
    elif isremote(key):  # scheme == 's3'
        # exists currently only works for files on s3 because
        # directories don't exist on s3, only files.
        return S3Connection().exists(key)
    else:
        raise InvalidSchemeException("URI Scheme {} is not implemented".format(k.scheme))
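A usage sketch, assuming this function is exposed on baiji's path module (the unqualified parse/islocal/isremote calls suggest a module-level function there); the key is hypothetical:

    from baiji import path

    if path.isfile('s3://example-bucket/data/model.obj'):
        print('found it')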
Example #5
 def md5(self, key_or_file):
     '''
     Return the MD5 checksum of a file. If it's on s3, don't download it.
     '''
     k = path.parse(key_or_file)
     if k.scheme == 'file':
         from baiji.util.md5 import md5_for_file
         return md5_for_file(k.path)
     elif k.scheme == 's3':
         res = self._get_etag(k.netloc, k.path)
         if "-" in res:
             raise ValueError("md5 hashes not available from s3 for files that were uploaded as multipart (if over 5gb, there's no hope; if under, try copying it to itself to have S3 reset the etag)")
         return res
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
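A usage sketch; the key is hypothetical:

    from baiji.connection import S3Connection

    try:
        print(S3Connection().md5('s3://example-bucket/data/model.obj'))
    except ValueError:
        print('multipart upload: the ETag is not an MD5 hash')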
Example #6
 def __init__(self, key, connection):
     import re
     from baiji import path
     self.raw = key
     self.connection = connection
     self.parsed = path.parse(key)
      self.remote_path = None  # the real value is set when self.path is assigned below; this just satisfies lint
     self.isdir = path.isdirlike(key)
     self.path = self.parsed.path
     if not (self.path.startswith(path.sep)
             or re.match(r'^[a-zA-Z]:', self.path)):
         self.path = path.sep + self.path
     self.bucket_name = self.parsed.netloc
     self.scheme = self.parsed.scheme
     if self.scheme not in ['file', 's3']:
         raise InvalidSchemeException(
             "URI Scheme %s is not implemented" % self.scheme)
Example #7
 def encrypt_at_rest(self, key):
     '''
     This method takes a key on s3 and encrypts it.
     Note that calling this method on a local file is an error
     and that calling it on an s3 key that is already encrypted,
     while allowed, is a no-op.
     '''
     k = path.parse(key)
     if k.scheme != 's3':
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
     remote_object = self._lookup(k.netloc, k.path)
     if remote_object is None:
         raise KeyNotFound("Error encrypting %s: Key doesn't exist" % (key, ))
      if not remote_object.encrypted:
         bucket = self._bucket(k.netloc)
         src = k.path
         if src.startswith(path.sep):
             src = src[len(path.sep):] # NB: copy_key is failing with absolute src keys...
         bucket.copy_key(src, k.netloc, src, preserve_acl=True, metadata=None, encrypt_key=True)
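A usage sketch; the key is hypothetical:

    from baiji.connection import S3Connection

    # A no-op if the key is already encrypted; raises
    # InvalidSchemeException for file:// URIs.
    S3Connection().encrypt_at_rest('s3://example-bucket/secrets/config.json')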
Example #8
 def etag(self, key_or_file):
     '''
      Return the s3 ETag of the file. For single-part uploads (for us, files under 5 GB) this is the same as the MD5.
     '''
     from baiji.copy import S3_MAX_UPLOAD_SIZE
     k = path.parse(key_or_file)
     if k.scheme == 'file':
         import math
         from baiji.util.md5 import md5_for_file
         file_size = os.path.getsize(k.path)
         if file_size > S3_MAX_UPLOAD_SIZE:
             n_parts = int(math.ceil(float(file_size) / S3_MAX_UPLOAD_SIZE))
             return self._build_etag(k.path, n_parts, S3_MAX_UPLOAD_SIZE)
         else:
             return md5_for_file(k.path)
     elif k.scheme == 's3':
         return self._get_etag(k.netloc, k.path)
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
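A usage sketch; both paths are hypothetical. Because the local branch reproduces s3's multipart ETag arithmetic, comparing the two is a cheap change check:

    from baiji.connection import S3Connection

    s3 = S3Connection()
    if s3.etag('/tmp/model.obj') != s3.etag('s3://example-bucket/model.obj'):
        print('local copy differs from s3')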
Example #9
    def rm_r(self, key_or_file, force=False, quiet=False):
        '''
        Recursively remove everything under an s3 prefix.

        Prompts for confirmation on each file unless force is True.

        Raises an exception for non-s3 URIs.
        '''
        k = path.parse(key_or_file)
        if not k.scheme == 's3':
            raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
        bucket = k.netloc
        keys_to_delete = self.ls(key_or_file)
        for key_to_delete in keys_to_delete:
            url = "s3://%s%s" % (bucket, key_to_delete)
            if not force:
                from baiji.util.console import confirm
                if not confirm("Remove %s" % url):
                    continue
            self.rm(url)
            if not quiet:
                print("[deleted] %s" % url)
Example #10
 def rm(self, key_or_file, version_id=None):
     '''
      Remove a local file or directory, or a key from AWS S3.
     '''
     import shutil
     from baiji.util.munging import _strip_initial_slashes
     k = path.parse(key_or_file)
     if k.scheme == 'file':
         if os.path.isdir(k.path):
             shutil.rmtree(k.path)
         elif os.path.exists(k.path):
             return os.remove(k.path)
         else:
             raise KeyNotFound("%s does not exist" % key_or_file)
     elif k.scheme == 's3':
         if not self.exists(key_or_file, version_id=version_id):
             raise KeyNotFound("%s does not exist" % key_or_file)
         return self._bucket(k.netloc).delete_key(_strip_initial_slashes(k.path), version_id=version_id)
     else:
         raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
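A usage sketch; both targets are hypothetical:

    from baiji.connection import S3Connection

    s3 = S3Connection()
    s3.rm('/tmp/scratch/model.obj')               # local file (or directory tree)
    s3.rm('s3://example-bucket/data/model.obj')   # s3 key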
Example #11
    def exists(self, key_or_file, retries_allowed=3, version_id=None):
        '''
        Check if a file exists on AWS S3.

        Returns a boolean.

        If the key is not found, we recheck up to `retries_allowed` times. We only do
        this on s3. We've had some observations of what appears to be eventual
        consistency, so this makes the call a bit more reliable. It does slow down the
        call in the case where the key does not exist.

        On a relatively slow, high-latency connection, a test of 100 trials retrieving
        a non-existent file gives:

            retries_allowed=1:   median=457.587 ms,    mean=707.12387 ms
            retries_allowed=3:   median=722.969 ms,    mean=1185.86299 ms
            retries_allowed=10:  median=2489.767 ms,   mean=2995.34233 ms
            retries_allowed=100: median=24694.0815 ms, mean=26754.64137 ms

        So assume that the default retries_allowed=3 costs a bit less than double the
        time of a single attempt.
        '''
        k = path.parse(key_or_file)
        if k.scheme == 'file':
            return os.path.exists(k.path)
        elif k.scheme == 's3':
            retry_attempts = 0
            while retry_attempts < retries_allowed:
                key = self._lookup(k.netloc, k.path, cache_buckets=True, version_id=version_id)
                if key:
                    if retry_attempts > 0: # only if we find it after failing at least once
                        import warnings
                        from baiji.exceptions import EventualConsistencyWarning
                        warnings.warn("S3 is behaving in an eventually consistent way in s3.exists({}) -- it took {} attempts to locate the key".format(key_or_file, retry_attempts+1), EventualConsistencyWarning)
                    return True
                retry_attempts += 1
            return False
        else:
            raise InvalidSchemeException("URI Scheme %s is not implemented" % k.scheme)
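A usage sketch; the key is hypothetical:

    from baiji.connection import S3Connection

    # Lower retries_allowed when a missing key is the expected case and
    # latency matters more than eventual-consistency protection.
    if not S3Connection().exists('s3://example-bucket/model.obj', retries_allowed=1):
        print('needs upload')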
Example #12
def isdir(key):
    '''
    Return true if key is directory-ish. That is, it ends with a path
    separator, or is a local directory that actually exists.
    On S3 a "directory" is considered to exist if one or more files exist
    that have the "directory" (ending with sep) as a prefix.
    '''
    from baiji.connection import S3Connection
    from baiji.exceptions import InvalidSchemeException

    k = parse(key)
    if islocal(key):  # this really only ensures that scheme == 'file'
        return os.path.isdir(k.path)
    elif isremote(key):  # scheme == 's3'
        if not k.path.endswith(sep):
            k = parse(key + sep)
        try:
            next(S3Connection().ls(k.geturl()))
            return True
        except StopIteration:
            return False
    else:
        raise InvalidSchemeException("URI Scheme {} is not implemented".format(k.scheme))
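A usage sketch, under the same module-location assumption as isfile above; the prefix is hypothetical:

    from baiji import path

    # True if at least one key has 'logs/' as a prefix; the trailing
    # separator is appended automatically when missing.
    print(path.isdir('s3://example-bucket/logs'))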
Example #13
    def execute(self):
        from boto.s3.connection import S3ResponseError
        if not self.force and self.dst.exists():
            if self.skip:
                import warnings
                warnings.warn(
                    "Skipping existing destination copying %s to %s: Destinaton exists"
                    % (self.src.uri, self.dst.uri))
                return
            else:
                raise KeyExists("Error copying %s to %s: Destination exists" %
                                (self.src.uri, self.dst.uri))

        if self.dst.is_file:
            self.prep_local_destination()

        try:
            if self.task == ('file', 'file'):
                self.local_copy()
            elif self.task == ('file', 's3'):
                self.upload()
            elif self.task == ('s3', 'file'):
                self.download()
            elif self.task == ('s3', 's3'):
                self.remote_copy()
            else:
                raise InvalidSchemeException(
                    "Copy for URI Scheme %s to %s is not implemented" %
                    self.task)
        except KeyNotFound:
            if self.dst.is_s3:
                try:
                    _ = self.dst.bucket
                except KeyNotFound:
                    raise KeyNotFound(
                        "Error copying {} to {}: Destination bucket doesn't exist"
                        .format(self.src.uri, self.dst.uri))
            if not self.src.exists():
                raise KeyNotFound(
                    "Error copying {} to {}: Source doesn't exist".format(
                        self.src.uri, self.dst.uri))
            else:
                raise KeyNotFound(
                    "Error copying {} to {}: Destination doesn't exist".format(
                        self.src.uri, self.dst.uri))
        except IOError as e:
            import errno
            if e.errno == errno.ENOENT:
                raise KeyNotFound(
                    "Error copying {} to {}: Source doesn't exist".format(
                        self.src.uri, self.dst.uri))
            else:
                raise S3Exception("Error copying {} to {}: {}".format(
                    self.src.uri, self.dst.uri, e))
        except S3ResponseError as e:
            if e.status == 403:
                raise S3Exception(
                    "HTTP Error 403: Permission Denied on {}".format(
                        " or ".join(
                            [x.uri for x in [self.src, self.dst] if x.is_s3])))
            else:
                raise
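A usage sketch. In baiji, execute() is normally reached through the public copy API rather than called directly; the cp call below and its force flag are assumptions based on that API, and the paths are hypothetical:

    from baiji.connection import S3Connection

    # Builds a copy operation for the (file, s3) task and runs execute(),
    # overwriting the destination instead of raising KeyExists.
    S3Connection().cp('/tmp/model.obj', 's3://example-bucket/model.obj', force=True)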