Example #1
File: s3.py Project: yuanda/mrjob
    def ls(self, path_glob):
        """Recursively list files on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't make S3 keys ending in '/')

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3)
        """
        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # if it's a "file" (doesn't end with /), just check if it exists
        if not glob_match and not path_glob.endswith('/'):
            uri = path_glob
            if self.get_s3_key(uri):
                yield uri
            return

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        for uri in self._s3_ls(base_uri):
            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            yield uri
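
Every example on this page leans on GLOB_RE to split the glob into a
literal prefix and a wildcard tail. The pattern itself isn't shown here;
a compatible sketch (hypothetical; the real definition lives elsewhere
in mrjob) might be:

import re

# hypothetical sketch of GLOB_RE: group(1) captures everything before
# the first wildcard character ('*', '?', or '['), group(2) the rest
GLOB_RE = re.compile(r'^(.*?)([\[*?].*)$')

m = GLOB_RE.match('s3://walrus/logs/2023-*/part-*')
print(m.group(1))  # s3://walrus/logs/2023-

# a path with no wildcard doesn't match at all, so callers fall back to
# treating the whole path as the prefix to list
print(GLOB_RE.match('s3://walrus/logs/'))  # None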
Example #2
    def ls(self, path_glob):
        """Recursively list files on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't make S3 keys ending in '/')

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3)
        """

        log.debug("ls %s", path_glob)

        # clean up the base uri so it matches what boto returns (s3://),
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        # Check if we're only going to get results by using a / on the end
        uris = self._s3_ls(base_uri)
        try:
            first = next(uris)
            uris = chain([first], uris)
        except (boto.exception.S3ResponseError, StopIteration):
            try:
                uris = self._s3_ls(base_uri.rstrip("/") + "/")
            except (boto.exception.S3ResponseError, StopIteration):
                return

        prev_uri = None
        for uri in uris:
            uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            # If there are keys /data and /data/my_file then we consider there
            # to be a file /data, overriding there being a directory called
            # /data containing a file my_file. We discard /data/my_file.
            if prev_uri is not None and uri.startswith(prev_uri):
                continue

            yield uri
            prev_uri = uri.rstrip("/") + "/"
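
The prev_uri bookkeeping at the end is the subtle part: once a key has
been yielded, any key that extends it past a '/' is treated as shadowed
and dropped. A standalone sketch of just that rule (the helper name is
hypothetical):

def drop_shadowed(uris):
    # uris must arrive in sorted order, as S3 listings do
    prev_uri = None
    for uri in uris:
        if prev_uri is not None and uri.startswith(prev_uri):
            continue
        yield uri
        prev_uri = uri.rstrip('/') + '/'

keys = ['s3://walrus/data', 's3://walrus/data/my_file', 's3://walrus/other']
print(list(drop_shadowed(keys)))  # ['s3://walrus/data', 's3://walrus/other']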
Example #3
def _path_glob_to_parsed_gcs_uri(path_glob):
    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_gcs_uri(base_uri)
    return bucket_name, base_name
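
parse_gcs_uri() is defined elsewhere; presumably it just splits the URI
into bucket and key. A minimal stand-in under that assumption:

from urllib.parse import urlparse

def parse_gcs_uri(uri):
    # hypothetical stand-in: split 'gs://bucket/key' into (bucket, key)
    parsed = urlparse(uri)
    if parsed.scheme != 'gs' or not parsed.netloc:
        raise ValueError('not a gs:// URI: %r' % uri)
    return parsed.netloc, parsed.path.lstrip('/')

print(parse_gcs_uri('gs://walrus/logs/2023-'))  # ('walrus', 'logs/2023-')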
Example #4
    def ls(self, path_glob):
        """Recursively list files on S3.

        *path_glob* can include ``?`` to match single characters or
        ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
        ``/``.

        .. versionchanged:: 0.5.0

            You no longer need a trailing slash to list "directories" on S3;
            both ``ls('s3://b/dir')`` and ``ls('s3://b/dir/')`` will list
            all keys starting with ``dir/``.
        """

        # clean up the base uri so it matches what boto returns (s3://)
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        bucket = self.get_bucket(bucket_name)
        for key in bucket.list(base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.name)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri
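
The dir_glob is what implements the 0.5.0 behavior change: a bare path
like 's3://b/dir' won't fnmatch the keys beneath it, but the derived
's3://b/dir/*' will. For instance:

import fnmatch

path_glob = 's3://b/dir'
dir_glob = path_glob + '/*'  # what the code above builds

uri = 's3://b/dir/part-00000'
print(fnmatch.fnmatchcase(uri, path_glob))  # False
print(fnmatch.fnmatchcase(uri, dir_glob))   # True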
Example #5
    def _ls(self, path_glob):
        """Helper method for :py:meth:`ls`; yields tuples of
        ``(uri, key)`` where *key* is the corresponding boto3 s3.ObjectSummary.
        """
        # clean up the base uri to ensure we pass boto3 an s3:// URI
        # (not s3n://)
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        try:
            bucket = self.get_bucket(bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) == 404:  # treat nonexistent as empty
                return
            raise

        for key in bucket.objects.filter(Prefix=base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri, key
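
_client_error_status() is a helper defined elsewhere in the module;
presumably it digs the HTTP status out of the botocore ClientError. A
sketch under that assumption:

def _client_error_status(ex):
    # hypothetical sketch: botocore ClientErrors carry the HTTP status
    # in their response metadata
    return ex.response.get('ResponseMetadata', {}).get('HTTPStatusCode')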
Example #6
File: s3.py Project: mikekap/mrjob
    def ls(self, path_glob):
        """Recursively list files on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't make S3 keys ending in '/')

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3)
        """

        # clean up the base uri so it matches what boto returns (s3://)
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # if it's a "file" (doesn't end with /), just check if it exists
        if not glob_match and not path_glob.endswith("/"):
            uri = path_glob
            if self.get_s3_key(uri):
                yield uri
            return

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        for uri in self._s3_ls(base_uri):
            uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            yield uri
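
Rebuilding each result with the caller's scheme is what lets s3n://
paths round-trip. Assuming parse_s3_uri() returns (bucket, key), the
rebuild works like this:

from urllib.parse import urlparse

def parse_s3_uri(uri):
    # hypothetical stand-in: split 's3://bucket/key' into (bucket, key)
    parsed = urlparse(uri)
    return parsed.netloc, parsed.path.lstrip('/')

scheme = 's3n'  # whatever scheme the caller used in path_glob
uri = 's3://walrus/data/part-00000'
print('%s://%s/%s' % ((scheme,) + parse_s3_uri(uri)))
# s3n://walrus/data/part-00000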
Example #7
File: gcs.py Project: Affirm/mrjob
    def _ls(self, path_glob):
        """Helper method for :py:meth:`ls`; yields tuples of
        ``(uri, blob)`` where *blob* is the corresponding
        :py:class:`google.cloud.storage.blob.Blob`.

        This *will* return empty "directory" globs.
        """
        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_gcs_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        try:
            bucket = self.get_bucket(bucket_name)
        except google.api_core.exceptions.NotFound:
            return  # treat nonexistent buckets as empty

        for blob in bucket.list_blobs(prefix=base_name):
            uri = "gs://%s/%s" % (bucket_name, blob.name)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri, blob