Example #1
  def __iter__(self):
    wildcard = self.wildcard_uri.object_name
    match = re.search(r'\*\*', wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      base_dir = wildcard[:match.start() - 1]
      remaining_wildcard = wildcard[match.start() + 2:]
      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException(
            'Invalid wildcard with more than 2 consecutive '
            '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
      filepaths = self._iter_dir(base_dir, remaining_wildcard)
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.iglob(wildcard)
    for filepath in filepaths:
      expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
      yield BucketListingRef(expanded_uri)
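The splitting of a recursive wildcard into a base directory and a remaining per-level pattern can be illustrated outside the gsutil classes. The following is a minimal standalone sketch using only the standard library; the function name expand_recursive_wildcard and its handling of the trailing pattern are our simplification, not the library's _iter_dir:

import fnmatch
import glob
import os
import re


def expand_recursive_wildcard(wildcard):
  """Yields paths matching a wildcard containing at most one '**'."""
  match = re.search(r'\*\*', wildcard)
  if not match:
    # No recursive component: a plain glob is sufficient.
    for path in glob.iglob(wildcard):
      yield path
    return
  # Split e.g. '/tmp/tmp2pQJAX/**/*' into base_dir='/tmp/tmp2pQJAX' and '*'.
  base_dir = wildcard[:match.start()].rstrip(os.sep) or os.sep
  remaining = wildcard[match.end():].lstrip(os.sep) or '*'
  for dirpath, _, filenames in os.walk(base_dir):
    for name in filenames:
      # Simplification: match the trailing pattern against file names only.
      if fnmatch.fnmatch(name, remaining):
        yield os.path.join(dirpath, name)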
Example #2
    def _DoImplicitBucketSubdirExpansionIfApplicable(self, uri, flat):
        """
    Checks whether uri could be an implicit bucket subdir and, if so, expands
    it; otherwise returns a list containing uri. For example, gs://abc would be
    an implicit bucket subdir if the -R option was specified and gs://abc/*
    matches anything.
    Can only be called for -R (recursion requested).

    Args:
      uri: StorageUri.
      flat: bool indicating whether bucket listings should be flattened, i.e.,
          so the mapped-to results contain objects spanning subdirectories.

    Returns:
      tuple (names_container, [BucketListingRefs to which uri expanded])
        where names_container is True if the URI names a directory, bucket,
        or bucket subdir (unlike StorageUri.names_container(), which does not
        handle the bucket-subdir case).
    """
        names_container = False
        result_list = []
        if uri.names_object():
            # URI could be a bucket subdir.
            implicit_subdir_matches = list(
                self.WildcardIterator(
                    self.suri_builder.StorageUri(
                        '%s/%s' %
                        (uri.uri.rstrip('/'), self._flatness_wildcard[flat]))))
            if len(implicit_subdir_matches) > 0:
                names_container = True
                result_list.extend(implicit_subdir_matches)
            else:
                result_list.append(BucketListingRef(uri))
        else:
            result_list.append(BucketListingRef(uri))
        return (names_container, result_list)
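Stripped of the gsutil plumbing, the probe amounts to: list 'uri/*' and treat the URI as a container exactly when that listing is non-empty. A hedged sketch of the same idea, written against a generic list_matches callable (a placeholder, not a gsutil API):

def expand_implicit_subdir(uri, list_matches):
  """Returns (names_container, expansions) for a possible bucket subdir."""
  matches = list(list_matches(uri.rstrip('/') + '/*'))
  if matches:
    # 'uri/*' matched something, so uri behaves like a bucket subdir.
    return True, matches
  # Nothing underneath: keep the URI itself as the single expansion.
  return False, [uri]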
Example #3
  def __iter__(self):
    empty = True
    for blr in self.blr_iter:
      uri = blr.GetUri()
      if not uri.names_object():
        empty = False
        yield (True, blr)
        break
      for key in uri.list_bucket(
          prefix=uri.object_name, headers=self.headers, all_versions=True):
        if key.name != uri.object_name:
          # The desired entries will be alphabetically first in this listing.
          break
        version_blr = BucketListingRef(uri.clone_replace_key(key), key=key)
        empty = False
        yield (False, version_blr)
      # If no version exists, yield the unversioned blr, and let the consuming
      # operation fail. This mirrors behavior in _ImplicitBucketSubdirIterator.
      if empty:
        yield (False, blr)
Example #4
    def __iter__(self):
        for uri_str in self.uri_strs:
            # Step 1: Expand any explicitly specified wildcards. The output from this
            # step is an iterator of BucketListingRef.
            # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd
            if ContainsWildcard(uri_str):
                post_step1_iter = self._WildcardIterator(uri_str)
            else:
                suri = self.suri_builder.StorageUri(uri_str)
                post_step1_iter = iter([BucketListingRef(suri)])
            post_step1_iter = PluralityCheckableIterator(post_step1_iter)

            # Step 2: Expand bucket subdirs and versions. The output from this
            # step is an iterator of (names_container, BucketListingRef).
            # Starting with gs://bucket/abcd this step would expand to:
            #   iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
            if self.flat and self.recursion_requested:
                post_step2_iter = _ImplicitBucketSubdirIterator(
                    self, post_step1_iter, self.flat)
            elif self.all_versions:
                post_step2_iter = _AllVersionIterator(self,
                                                      post_step1_iter,
                                                      headers=self.headers)
            else:
                post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
            post_step2_iter = PluralityCheckableIterator(post_step2_iter)

            # Step 3. Expand directories and buckets. This step yields the iterated
            # values. Starting with gs://bucket this step would expand to:
            #  [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
            # Starting with file://dir this step would expand to:
            #  [dir/a.txt, dir/b.txt, dir/c/]
            exp_src_bucket_listing_refs = []
            wc = self._flatness_wildcard[self.flat]
            src_uri_expands_to_multi = (post_step1_iter.has_plurality()
                                        or post_step2_iter.has_plurality())
            is_multi_src_request = (self.uri_strs.has_plurality()
                                    or src_uri_expands_to_multi)

            if post_step2_iter.is_empty():
                raise CommandException('No URIs matched: %s' % uri_str)
            for (names_container, blr) in post_step2_iter:
                if (not blr.GetUri().names_container()
                        and (self.flat or not blr.HasPrefix())):
                    yield NameExpansionResult(uri_str,
                                              is_multi_src_request,
                                              src_uri_expands_to_multi,
                                              names_container,
                                              blr.GetUriString(),
                                              self.have_existing_dst_container,
                                              is_latest=blr.IsLatest())
                    continue
                if not self.recursion_requested:
                    if blr.GetUri().is_file_uri():
                        desc = 'directory'
                    else:
                        desc = 'bucket'
                    print 'Omitting %s "%s". (Did you mean to do %s -R?)' % (
                        desc, blr.GetUri(), self.command_name)
                    continue
                if blr.GetUri().is_file_uri():
                    # Convert dir to implicit recursive wildcard.
                    uri_to_iterate = '%s/%s' % (blr.GetUriString(), wc)
                else:
                    # Convert bucket to implicit recursive wildcard.
                    uri_to_iterate = blr.GetUri().clone_replace_name(wc)
                wc_iter = PluralityCheckableIterator(
                    self._WildcardIterator(uri_to_iterate))
                src_uri_expands_to_multi = (src_uri_expands_to_multi
                                            or wc_iter.has_plurality())
                is_multi_src_request = (self.uri_strs.has_plurality()
                                        or src_uri_expands_to_multi)
                for blr in wc_iter:
                    yield NameExpansionResult(uri_str,
                                              is_multi_src_request,
                                              src_uri_expands_to_multi,
                                              True,
                                              blr.GetUriString(),
                                              self.have_existing_dst_container,
                                              is_latest=blr.IsLatest())
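The pipeline above relies on PluralityCheckableIterator exposing is_empty() and has_plurality() without losing elements. A minimal sketch of such a wrapper, based only on how it is used here (the real gslib class may differ in details):

import itertools


class PeekablePluralityIterator(object):
  """Buffers up to two items so emptiness/plurality can be checked early."""

  def __init__(self, it):
    self._it = iter(it)
    self._buf = list(itertools.islice(self._it, 2))

  def is_empty(self):
    return not self._buf

  def has_plurality(self):
    return len(self._buf) > 1

  def __iter__(self):
    while self._buf:
      yield self._buf.pop(0)
    for elem in self._it:
      yield elem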
Example #5
    def __iter__(self):
        """Python iterator that gets called when iterating over cloud wildcard.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
        # First handle bucket wildcarding, if any.
        if ContainsWildcard(self.wildcard_uri.bucket_name):
            regex = fnmatch.translate(self.wildcard_uri.bucket_name)
            bucket_uris = []
            prog = re.compile(regex)
            self.proj_id_handler.FillInProjectHeaderIfNeeded(
                WILDCARD_BUCKET_ITERATOR, self.wildcard_uri, self.headers)
            for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
                if prog.match(b.name):
                    # Use str(b.name) because get_all_buckets() returns Unicode
                    # string, which when used to construct x-goog-copy-src metadata
                    # requests for object-to-object copies causes pathname '/' chars
                    # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
                    # which causes the request to fail.
                    uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                           urllib.quote_plus(str(b.name)))
                    # TODO: Move bucket_uris to a separate generator function that yields
                    # values instead of pre-computing the list.
                    bucket_uris.append(
                        boto.storage_uri(
                            uri_str, debug=self.debug,
                            bucket_storage_uri_class=self.bucket_storage_uri_class,
                            suppress_consec_slashes=False))
        else:
            bucket_uris = [self.wildcard_uri.clone_replace_name('')]

        # Now iterate over bucket(s), and handle object wildcarding, if any.
        self.proj_id_handler.FillInProjectHeaderIfNeeded(
            WILDCARD_OBJECT_ITERATOR, self.wildcard_uri, self.headers)
        for bucket_uri in bucket_uris:
            if self.wildcard_uri.names_bucket():
                # Bucket-only URI.
                yield BucketListingRef(bucket_uri,
                                       key=None,
                                       prefix=None,
                                       headers=self.headers)
            else:
                # URI contains an object name. If there's no wildcard just yield
                # the needed URI.
                if not ContainsWildcard(self.wildcard_uri.object_name):
                    uri_to_yield = bucket_uri.clone_replace_name(
                        self.wildcard_uri.object_name)
                    yield BucketListingRef(uri_to_yield,
                                           key=None,
                                           prefix=None,
                                           headers=self.headers)
                else:
                    # URI contains a wildcard. Expand iteratively by building
                    # prefix/delimiter bucket listing request, filtering the results per
                    # the current level's wildcard, and continuing with the next component
                    # of the wildcard. See _BuildBucketFilterStrings() documentation
                    # for details.
                    #
                    # Initialize the iteration with bucket name from bucket_uri but
                    # object name from self.wildcard_uri. This is needed to handle cases
                    # where both the bucket and object names contain wildcards.
                    uris_needing_expansion = [
                        bucket_uri.clone_replace_name(
                            self.wildcard_uri.object_name)
                    ]
                    while len(uris_needing_expansion) > 0:
                        uri = uris_needing_expansion.pop(0)
                        (prefix, delimiter, prefix_wildcard,
                         suffix_wildcard) = (self._BuildBucketFilterStrings(
                             uri.object_name))
                        prog = re.compile(fnmatch.translate(prefix_wildcard))
                        # List bucket for objects matching prefix up to delimiter.
                        for key in bucket_uri.list_bucket(
                                prefix=prefix,
                                delimiter=delimiter,
                                headers=self.headers,
                                all_versions=self.all_versions):
                            # Check that the prefix regex matches rstripped key.name (to
                            # correspond with the rstripped prefix_wildcard from
                            # _BuildBucketFilterStrings()).
                            keyname = key.name
                            if isinstance(key, Prefix):
                                keyname = keyname.rstrip('/')
                            if prog.match(keyname):
                                if suffix_wildcard and keyname != suffix_wildcard:
                                    if isinstance(key, Prefix):
                                        # There's more wildcard left to expand.
                                        uris_needing_expansion.append(
                                            uri.clone_replace_name(
                                                key.name.rstrip('/') + '/' +
                                                suffix_wildcard))
                                else:
                                    # Done expanding.
                                    expanded_uri = uri.clone_replace_key(key)

                                    if isinstance(key, Prefix):
                                        yield BucketListingRef(
                                            expanded_uri,
                                            key=None,
                                            prefix=key,
                                            headers=self.headers)
                                    else:
                                        if self.all_versions:
                                            yield BucketListingRef(
                                                expanded_uri,
                                                key=key,
                                                prefix=None,
                                                headers=self.headers)
                                        else:
                                            # Yield BLR wrapping version-less URI.
                                            yield BucketListingRef(
                                                expanded_uri.clone_replace_name(
                                                    expanded_uri.object_name),
                                                key=key,
                                                prefix=None,
                                                headers=self.headers)
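The per-level splitting performed by _BuildBucketFilterStrings() is not shown above. The following is a simplified sketch of that idea, consistent with the comments in these examples but not a copy of the real helper (edge cases such as multiple '**' components are ignored):

import re

WILDCARD_REGEX = re.compile(r'[*?\[\]]')


def build_bucket_filter_strings(wildcard):
  """Splits an object wildcard into (prefix, delimiter, prefix_wildcard,
  suffix_wildcard) for one round of prefix/delimiter bucket listing."""
  match = WILDCARD_REGEX.search(wildcard)
  if not match:
    # No wildcard at all: list exactly this name.
    return wildcard, '/', wildcard, ''
  # Everything before the first wildcard char can be sent as the listing prefix.
  prefix = wildcard[:match.start()]
  # '**' recurses across '/' boundaries, so drop the delimiter in that case.
  delimiter = '' if '**' in wildcard else '/'
  # The wildcard up to the next '/' filters this level's results; the rest is
  # expanded in a later iteration.
  end = wildcard.find('/', match.start())
  if end == -1:
    return prefix, delimiter, wildcard, ''
  return prefix, delimiter, wildcard[:end], wildcard[end + 1:]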
Example #6
    def ExpandWildcardsAndContainers(self,
                                     uri_strs,
                                     recursion_requested,
                                     flat=True):
        """
    Expands wildcards, object-less bucket names, subdir bucket names, and
    directory names, producing a flat listing of all the matching objects/files.

    Args:
      uri_strs: List of URI strings needing expansion.
      recursion_requested: True if -R specified on command-line.
      flat: Bool indicating whether bucket listings should be flattened, i.e.,
          so the mapped-to results contain objects spanning subdirectories.

    Returns:
      gslib.name_expansion.NameExpansionResult.

    Raises:
      CommandException: if errors encountered.

    Examples with flat=True:
      - Calling with one of the uri_strs being 'gs://bucket' will enumerate all
        top-level objects, as will 'gs://bucket/' and 'gs://bucket/*'.
      - 'gs://bucket/**' will enumerate all objects in the bucket.
      - 'gs://bucket/abc' will enumerate all next-level objects under directory
        abc (i.e., not including subdirectories of abc) if gs://bucket/abc/*
        matches any objects; otherwise it will enumerate the single name
        gs://bucket/abc
      - 'gs://bucket/abc/**' will enumerate all objects under abc or any of its
        subdirectories.
      - 'file:///tmp' will enumerate all files under /tmp, as will
        'file:///tmp/*'
      - 'file:///tmp/**' will enumerate all files under /tmp or any of its
        subdirectories.

    Example if flat=False: calling with gs://bucket/abc/* lists matching objects
    or subdirs, but not sub-subdirs or objects beneath subdirs.

    Note: In step-by-step comments below we give examples assuming there's a
    gs://bucket with object paths:
      abcd/o1.txt
      abcd/o2.txt
      xyz/o1.txt
      xyz/o2.txt
    and a directory file://dir with file paths:
      dir/a.txt
      dir/b.txt
      dir/c/
    """
        result = NameExpansionResult()
        for uri_str in uri_strs:

            # Step 1: Expand any explicitly specified wildcards.
            # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd
            if ContainsWildcard(uri_str):
                post_step1_bucket_listing_refs = list(
                    self.WildcardIterator(uri_str))
            else:
                post_step1_bucket_listing_refs = [
                    BucketListingRef(self.suri_builder.StorageUri(uri_str))
                ]

            # Step 2: Expand subdirs.
            # Starting with gs://bucket/abcd this step would expand to:
            #   [abcd/o1.txt, abcd/o2.txt].
            uri_names_container = False
            if flat:
                if recursion_requested:
                    post_step2_bucket_listing_refs = []
                    for bucket_listing_ref in post_step1_bucket_listing_refs:
                        (uri_names_container, bucket_listing_refs) = (
                            self._DoImplicitBucketSubdirExpansionIfApplicable(
                                bucket_listing_ref.GetUri(), flat))
                        post_step2_bucket_listing_refs.extend(
                            bucket_listing_refs)
                else:
                    uri_names_container = False
                    post_step2_bucket_listing_refs = post_step1_bucket_listing_refs
            else:
                uri_names_container = False
                post_step2_bucket_listing_refs = post_step1_bucket_listing_refs

            # Step 3. Expand directories and buckets.
            # Starting with gs://bucket this step would expand to:
            #  [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
            # Starting with file://dir this step would expand to:
            #  [dir/a.txt, dir/b.txt, dir/c/]
            exp_src_bucket_listing_refs = []
            wc = self._flatness_wildcard[flat]
            for bucket_listing_ref in post_step2_bucket_listing_refs:
                if (not bucket_listing_ref.GetUri().names_container()
                        and (flat or not bucket_listing_ref.HasPrefix())):
                    exp_src_bucket_listing_refs.append(bucket_listing_ref)
                    continue
                if not recursion_requested:
                    if bucket_listing_ref.GetUri().is_file_uri():
                        desc = 'directory'
                    else:
                        desc = 'bucket'
                    print 'Omitting %s "%s". (Did you mean to do %s -R?)' % (
                        desc, bucket_listing_ref.GetUri(), self.command_name)
                    continue
                uri_names_container = True
                if bucket_listing_ref.GetUri().is_file_uri():
                    # Convert dir to implicit recursive wildcard.
                    uri_to_iter = '%s/%s' % (bucket_listing_ref.GetUriString(),
                                             wc)
                else:
                    # Convert bucket to implicit recursive wildcard.
                    uri_to_iter = (
                        bucket_listing_ref.GetUri().clone_replace_name(wc))
                wildcard_result = list(self.WildcardIterator(uri_to_iter))
                if len(wildcard_result) > 0:
                    exp_src_bucket_listing_refs.extend(wildcard_result)

            result._AddExpansion(self.suri_builder.StorageUri(uri_str),
                                 uri_names_container,
                                 exp_src_bucket_listing_refs)

        return result
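Both expansion methods index _flatness_wildcard with the flat flag. Judging from the docstring semantics (flat listings span subdirectories), the mapping is presumably along these lines; this is an assumption about the source, not a quote from it:

# flat=True expands containers recursively; flat=False only one level deep.
_flatness_wildcard = {True: '**', False: '*'}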
Example #7
  def __iter__(self):
    """Python iterator that gets called when iterating over cloud wildcard.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    # First handle bucket wildcarding, if any.
    if ContainsWildcard(self.wildcard_uri.bucket_name):
      regex = fnmatch.translate(self.wildcard_uri.bucket_name)
      bucket_uris = []
      prog = re.compile(regex)
      self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR,
                                                       self.wildcard_uri,
                                                       self.headers)
      for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
        if prog.match(b.name):
          # Use str(b.name) because get_all_buckets() returns Unicode
          # string, which when used to construct x-goog-copy-src metadata
          # requests for object-to-object copies causes pathname '/' chars
          # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
          # which causes the request to fail.
          uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                 urllib.quote_plus(str(b.name)))
          bucket_uris.append(
              boto.storage_uri(
                  uri_str, debug=self.debug,
                  bucket_storage_uri_class=self.bucket_storage_uri_class,
                  suppress_consec_slashes=False))
    else:
      bucket_uris = [self.wildcard_uri.clone_replace_name('')]

    # Now iterate over bucket(s), and handle object wildcarding, if any.
    self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR,
                                                     self.wildcard_uri,
                                                     self.headers)
    for bucket_uri in bucket_uris:
      if self.wildcard_uri.names_bucket():
        # Bucket-only URI.
        yield BucketListingRef(bucket_uri, key=None, prefix=None,
                               headers=self.headers)
      else:
        # URI contains an object name. If there's no wildcard just yield
        # the needed URI.
        if not ContainsWildcard(self.wildcard_uri.object_name):
          uri_to_yield = bucket_uri.clone_replace_name(
              self.wildcard_uri.object_name)
          yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                                 headers=self.headers)
        else:
          # URI contains a wildcard. Expand iteratively by making a prefix
          # query of the string preceding the first wildcard char, setting
          # delimiter=/ (unless the wildcard is **), then filtering the results
          # by the wildcard at that level. For example given the wildcard:
          #   gs://bucket/abc/d*e/f*.txt
          # we would:
          #   - get a bucket listing with prefix=abc/d, delimiter=/
          #   - filter each result for those that start with the result + *e
          # Assuming gs://bucket/abc/dxyze is a result from this iteration, the
          # next iteration would:
          #   - get a bucket listing with prefix= abc/dxyze, delimiter=/
          #   - filter each result for those that start with the result + f.txt
          #
          # Initialize the iteration with bucket name from bucket_uri but
          # object name from self.wildcard_uri. This is needed to handle cases
          # where both the bucket and object names contain wildcards.
          uris_needing_expansion = [
              bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
          while len(uris_needing_expansion) > 0:
            uri = uris_needing_expansion.pop(0)
            (prefix, delimiter, prefix_wildcard, suffix) = (
                self._BuildBucketFilterStrings(uri.object_name))
            prog = re.compile(fnmatch.translate(prefix_wildcard))
            # List bucket for objects matching prefix up to delimiter.
            for key in bucket_uri.get_bucket(
                validate=False, headers=self.headers).list(
                    prefix=prefix, delimiter=delimiter, headers=self.headers):
              # Check that the prefix regex matches.
              # Match rstripped key.name, to correspond with the rstripped
              # prefix_wildcard from _BuildBucketFilterStrings.
              if prog.match(key.name.rstrip('/')):
                if suffix and WILDCARD_REGEX.search(suffix):
                  # There's more wildcard left to expand.
                  uris_needing_expansion.append(
                      uri.clone_replace_name(key.name + suffix))
                else:
                  # Done expanding.
                  if suffix:
                    expanded_uri = uri.clone_replace_name(key.name + suffix)
                  else:
                    expanded_uri = uri.clone_replace_name(key.name)
                  if isinstance(key, Prefix):
                    yield BucketListingRef(expanded_uri, key=None, prefix=key,
                                           headers=self.headers)
                  else:
                    yield BucketListingRef(expanded_uri, key=key, prefix=None,
                                           headers=self.headers)