def ls(self, path_glob):
    """Recursively list files on S3.

    This doesn't list "directories" unless there's actually a
    corresponding key ending with a '/' (which is weird and confusing;
    don't make S3 keys ending in '/')

    To list a directory, path_glob must end with a trailing
    slash (foo and foo/ are different on S3)
    """
    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # if it's a "file" (doesn't end with /), just check if it exists
    if not glob_match and not path_glob.endswith('/'):
        uri = path_glob
        if self.get_s3_key(uri):
            yield uri
        return

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    for uri in self._s3_ls(base_uri):
        # enforce globbing
        if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
            continue

        yield uri
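All of the versions below rely on a module-level GLOB_RE whose first group captures the literal prefix before the first wildcard, and which does not match plain (non-glob) paths at all. A minimal sketch of such a pattern, as an assumption about what the real GLOB_RE looks like:

import re

# assumed definition; the real GLOB_RE in the source module may differ.
# group(1) is the literal prefix before the first glob character ([, * or ?);
# paths with no glob characters don't match at all.
GLOB_RE = re.compile(r'^(.*?)([\[\*\?].*)$')

match = GLOB_RE.match('s3://walrus/data/part-*')
print(match.group(1))                           # -> 's3://walrus/data/part-'
print(GLOB_RE.match('s3://walrus/data/file'))   # -> None (no wildcard)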
def ls(self, path_glob):
    """Recursively list files on S3.

    This doesn't list "directories" unless there's actually a
    corresponding key ending with a '/' (which is weird and confusing;
    don't make S3 keys ending in '/')

    To list a directory, path_glob must end with a trailing
    slash (foo and foo/ are different on S3)
    """
    log.debug("ls %s", path_glob)

    # clean up the base uri to ensure we have an equal uri to boto (s3://)
    # just in case we get passed s3n://
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    # Check if we're only going to get results by using a / on the end
    uris = self._s3_ls(base_uri)
    try:
        first = uris.next()
        uris = chain([first], uris)
    except (boto.exception.S3ResponseError, StopIteration):
        try:
            uris = self._s3_ls(base_uri.rstrip("/") + "/")
        except (boto.exception.S3ResponseError, StopIteration):
            return

    prev_uri = None
    for uri in uris:
        uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

        # enforce globbing
        if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
            continue

        # If there are keys /data and /data/my_file then we consider there
        # to be a file /data, overriding there being a directory called
        # /data containing a file my_file. We discard /data/my_file.
        if prev_uri is not None and uri.startswith(prev_uri):
            continue

        yield uri
        prev_uri = uri.rstrip("/") + "/"
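The prev_uri check above implements the "a key shadows anything under it" rule from the comment. A standalone illustration of just that filter, using a hypothetical, already-sorted listing (S3 returns keys in lexicographic order):

# hypothetical listing; only the filtering logic is taken from the code above
uris = ['s3://walrus/data', 's3://walrus/data/my_file', 's3://walrus/other']

prev_uri = None
kept = []
for uri in uris:
    # skip anything "inside" a key we already treated as a file
    if prev_uri is not None and uri.startswith(prev_uri):
        continue
    kept.append(uri)
    prev_uri = uri.rstrip('/') + '/'

print(kept)  # -> ['s3://walrus/data', 's3://walrus/other']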
def _path_glob_to_parsed_gcs_uri(path_glob):
    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_gcs_uri(base_uri)

    return bucket_name, base_name
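parse_gcs_uri is assumed to split a gs://bucket/key URI into a (bucket, key) pair. A minimal sketch of that behavior (the real helper presumably does stricter validation):

from urllib.parse import urlparse

def parse_gcs_uri(uri):
    # hedged sketch: split 'gs://bucket/some/key' into ('bucket', 'some/key')
    parsed = urlparse(uri)
    if parsed.scheme != 'gs':
        raise ValueError('not a gs:// URI: %r' % uri)
    return parsed.netloc, parsed.path.lstrip('/')

print(parse_gcs_uri('gs://walrus/data/part-00000'))
# -> ('walrus', 'data/part-00000')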
def ls(self, path_glob):
    """Recursively list files on S3.

    *path_glob* can include ``?`` to match single characters or
    ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
    ``/``.

    .. versionchanged:: 0.5.0

        You no longer need a trailing slash to list "directories" on S3;
        both ``ls('s3://b/dir')`` and ``ls('s3://b/dir/')`` will list all
        keys starting with ``dir/``.
    """
    # clean up the base uri to ensure we have an equal uri to boto (s3://)
    # just in case we get passed s3n://
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_s3_uri(base_uri)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    bucket = self.get_bucket(bucket_name)
    for key in bucket.list(base_name):
        uri = "%s://%s/%s" % (scheme, bucket_name, key.name)

        # enforce globbing
        if not (fnmatch.fnmatchcase(uri, path_glob) or
                fnmatch.fnmatchcase(uri, dir_glob)):
            continue

        yield uri
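The dir_glob trick is what makes the trailing slash optional: a key matches either the glob itself or the glob treated as a "directory". A small, self-contained illustration with fnmatch.fnmatchcase (bucket and key names are made up):

import fnmatch

path_glob = 's3://walrus/logs'   # no trailing slash, no wildcard
dir_glob = path_glob + '/*'      # matches anything under logs/

for uri in ('s3://walrus/logs',           # the "file" itself
            's3://walrus/logs/2018.txt',  # a key under the "directory"
            's3://walrus/logs.bak'):      # unrelated key
    matched = (fnmatch.fnmatchcase(uri, path_glob) or
               fnmatch.fnmatchcase(uri, dir_glob))
    print(uri, matched)
# s3://walrus/logs True
# s3://walrus/logs/2018.txt True
# s3://walrus/logs.bak False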
def _ls(self, path_glob):
    """Helper method for :py:meth:`ls`; yields tuples of
    ``(uri, key)`` where *key* is the corresponding boto3
    s3.ObjectSummary.
    """
    # clean up the base uri to ensure we pass boto3 an s3:// URI
    # (not s3n://)
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_s3_uri(base_uri)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    try:
        bucket = self.get_bucket(bucket_name)
    except botocore.exceptions.ClientError as ex:
        if _client_error_status(ex) == 404:
            # treat nonexistent as empty
            return
        raise

    for key in bucket.objects.filter(Prefix=base_name):
        uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

        # enforce globbing
        if not (fnmatch.fnmatchcase(uri, path_glob) or
                fnmatch.fnmatchcase(uri, dir_glob)):
            continue

        yield uri, key
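_client_error_status is not shown here; presumably it digs the HTTP status code out of a botocore ClientError. One hedged way to write such a helper (the real one may also inspect the error code string):

def _client_error_status(ex):
    # hedged sketch: pull the HTTP status code out of a botocore ClientError
    return ex.response.get('ResponseMetadata', {}).get('HTTPStatusCode')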
def ls(self, path_glob):
    """Recursively list files on S3.

    This doesn't list "directories" unless there's actually a
    corresponding key ending with a '/' (which is weird and confusing;
    don't make S3 keys ending in '/')

    To list a directory, path_glob must end with a trailing
    slash (foo and foo/ are different on S3)
    """
    # clean up the base uri to ensure we have an equal uri to boto (s3://)
    # just in case we get passed s3n://
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # if it's a "file" (doesn't end with /), just check if it exists
    if not glob_match and not path_glob.endswith("/"):
        uri = path_glob
        if self.get_s3_key(uri):
            yield uri
        return

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    for uri in self._s3_ls(base_uri):
        uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

        # enforce globbing
        if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
            continue

        yield uri
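get_s3_key, used for the non-glob existence check above, is not shown. In the boto-era API a bucket's get_key() returns the Key or None, so a hypothetical sketch of the helper might look like this (assumption only; the real method may handle credentials, regions, and missing buckets differently):

def get_s3_key(self, uri):
    # hedged sketch: return the boto Key for *uri*, or None if it doesn't exist
    bucket_name, key_name = parse_s3_uri(uri)
    return self.get_bucket(bucket_name).get_key(key_name)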
def _ls(self, path_glob):
    """Helper method for :py:meth:`ls`; yields tuples of
    ``(uri, blob)`` where *blob* is the corresponding
    :py:class:`google.cloud.storage.blob.Blob`.

    This *will* return empty "directory" globs.
    """
    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_gcs_uri(base_uri)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    try:
        bucket = self.get_bucket(bucket_name)
    except NotFound:
        return  # treat nonexistent buckets as empty

    for blob in bucket.list_blobs(prefix=base_name):
        uri = "gs://%s/%s" % (bucket_name, blob.name)

        # enforce globbing
        if not (fnmatch.fnmatchcase(uri, path_glob) or
                fnmatch.fnmatchcase(uri, dir_glob)):
            continue

        yield uri, blob

def _ls(self, path_glob):
    """Helper method for :py:meth:`ls`; yields tuples of
    ``(uri, blob)`` where *blob* is the corresponding
    :py:class:`google.cloud.storage.blob.Blob`.

    This *will* return empty "directory" globs.
    """
    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_gcs_uri(base_uri)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    try:
        bucket = self.get_bucket(bucket_name)
    except google.api_core.exceptions.NotFound:
        return  # treat nonexistent buckets as empty

    for blob in bucket.list_blobs(prefix=base_name):
        uri = "gs://%s/%s" % (bucket_name, blob.name)

        # enforce globbing
        if not (fnmatch.fnmatchcase(uri, path_glob) or
                fnmatch.fnmatchcase(uri, dir_glob)):
            continue

        yield uri, blob
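With _ls yielding (uri, blob) pairs, the public ls presumably just strips off the second element. A minimal sketch of that wrapper, based only on the helper's docstring:

def ls(self, path_glob):
    """Recursively list files matching *path_glob*, yielding URIs only."""
    for uri, blob in self._ls(path_glob):
        yield uri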