def __iter__(self):
    """Iterates over file system matches for this file wildcard URI.

    Yields:
        BucketListingRef wrapping a URI for each matching file path.

    Raises:
        WildcardException: if the wildcard contains 3 or more consecutive
            '*' chars.
    """
    wildcard = self.wildcard_uri.object_name
    match = re.search(r'\*\*', wildcard)
    if match:
        # Recursive wildcarding request ('.../**/...').
        # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
        #
        # Bug fix: the previous slice, wildcard[:match.start() - 1], turned
        # into wildcard[:-1] whenever the wildcard *started* with '**'
        # (match.start() == 0), leaving the '**' itself inside base_dir.
        # Slice up to the '**' and strip the trailing separator instead,
        # which yields the same result as before for the normal
        # 'dir/**/...' case and '' for a leading '**'.
        base_dir = wildcard[:match.start()].rstrip(os.sep)
        remaining_wildcard = wildcard[match.start() + 2:]
        # At this point for the above example base_dir = '/tmp/tmp2pQJAX'
        # and remaining_wildcard = '/*'.
        if remaining_wildcard.startswith('*'):
            raise WildcardException('Invalid wildcard with more than 2 '
                                    'consecutive *s (%s)' % wildcard)
        # If there was no remaining wildcard past the recursive wildcard,
        # treat it as if it were a '*'. For example, file://tmp/** is
        # equivalent to file://tmp/**/*.
        if not remaining_wildcard:
            remaining_wildcard = '*'
        # Skip slash(es) so the per-directory matcher sees a bare pattern.
        remaining_wildcard = remaining_wildcard.lstrip(os.sep)
        filepaths = self._iter_dir(base_dir, remaining_wildcard)
    else:
        # Not a recursive wildcarding request; glob handles it directly.
        filepaths = glob.iglob(wildcard)
    for filepath in filepaths:
        expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
        yield BucketListingRef(expanded_uri)
def _DoImplicitBucketSubdirExpansionIfApplicable(self, uri, flat):
    """Expands uri as an implicit bucket subdir when anything matches.

    Probes whether '<uri>/<wildcard>' (wildcard chosen by flat) matches
    anything; if so uri is treated as a bucket subdir and the matches are
    returned. If nothing matches, or uri doesn't name an object at all,
    a single-element list wrapping uri is returned. Can only be called
    for -R (recursion requested).

    Args:
        uri: StorageUri.
        flat: bool indicating whether bucket listings should be flattened,
            i.e., so the mapped-to results contain objects spanning
            subdirectories.

    Returns:
        Tuple (names_container, bucket_listing_refs) where names_container
        is True iff uri turned out to name a directory, bucket, or bucket
        subdir (vs how StorageUri.names_container() doesn't handle the
        latter case).
    """
    if uri.names_object():
        # URI could be a bucket subdir: list it with the flatness wildcard
        # appended and see whether anything comes back.
        probe_uri = self.suri_builder.StorageUri(
            '%s/%s' % (uri.uri.rstrip('/'), self._flatness_wildcard[flat]))
        subdir_matches = list(self.WildcardIterator(probe_uri))
        if subdir_matches:
            return (True, subdir_matches)
    return (False, [BucketListingRef(uri)])
def __iter__(self):
    """Yields (names_container, BucketListingRef) tuples, expanding versions.

    For each incoming BucketListingRef that names an object, lists the
    bucket with all_versions=True and yields one (False, version_blr) per
    matching version. A ref that does not name an object is yielded as
    (True, blr) unchanged.
    """
    empty = True
    for blr in self.blr_iter:
        uri = blr.GetUri()
        if not uri.names_object():
            # Container (bucket/prefix) ref: pass through unchanged.
            # NOTE(review): this `break` abandons any remaining refs in
            # blr_iter after the first container ref — presumably callers
            # rely on that; confirm against call sites before changing.
            empty = False
            yield (True, blr)
            break
        for key in uri.list_bucket(prefix=uri.object_name,
                                   headers=self.headers,
                                   all_versions=True):
            if key.name != uri.object_name:
                # The desired entries will be alphabetically first in this
                # listing, so the first non-matching name ends the scan.
                break
            version_blr = BucketListingRef(uri.clone_replace_key(key), key=key)
            empty = False
            yield (False, version_blr)
    # If no version exists, yield the unversioned blr, and let the consuming
    # operation fail. This mirrors behavior in _ImplicitBucketSubdirIterator.
    # (Relies on `blr` still being bound to the last loop value here.)
    if empty:
        yield (False, blr)
def __iter__(self):
    """Expands each source URI string through three steps and yields results.

    Yields:
        NameExpansionResult for every object/file each uri_str expands to.

    Raises:
        CommandException: if a uri_str matches nothing.
    """
    for uri_str in self.uri_strs:
        # Step 1: Expand any explicitly specified wildcards. The output from
        # this step is an iterator of BucketListingRef.
        # Starting with gs://buck*/abc* this step would expand to
        # gs://bucket/abcd.
        if ContainsWildcard(uri_str):
            post_step1_iter = self._WildcardIterator(uri_str)
        else:
            suri = self.suri_builder.StorageUri(uri_str)
            post_step1_iter = iter([BucketListingRef(suri)])
        post_step1_iter = PluralityCheckableIterator(post_step1_iter)

        # Step 2: Expand bucket subdirs and versions. The output from this
        # step is an iterator of (names_container, BucketListingRef).
        # Starting with gs://bucket/abcd this step would expand to:
        # iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
        if self.flat and self.recursion_requested:
            post_step2_iter = _ImplicitBucketSubdirIterator(
                self, post_step1_iter, self.flat)
        elif self.all_versions:
            post_step2_iter = _AllVersionIterator(self, post_step1_iter,
                                                  headers=self.headers)
        else:
            post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
        post_step2_iter = PluralityCheckableIterator(post_step2_iter)

        # Step 3: Expand directories and buckets. This step yields the
        # iterated values. Starting with gs://bucket this step would expand
        # to: [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
        # Starting with file://dir this step would expand to:
        # [dir/a.txt, dir/b.txt, dir/c/]
        # NOTE(review): exp_src_bucket_listing_refs appears unused in this
        # iterator (likely left over from the non-iterator implementation).
        exp_src_bucket_listing_refs = []
        wc = self._flatness_wildcard[self.flat]
        # Plurality must be computed before consuming the iterators; it is
        # also re-widened below as recursive wildcards are expanded.
        src_uri_expands_to_multi = (post_step1_iter.has_plurality()
                                    or post_step2_iter.has_plurality())
        is_multi_src_request = (self.uri_strs.has_plurality()
                                or src_uri_expands_to_multi)
        if post_step2_iter.is_empty():
            raise CommandException('No URIs matched: %s' % uri_str)
        for (names_container, blr) in post_step2_iter:
            if (not blr.GetUri().names_container()
                and (self.flat or not blr.HasPrefix())):
                # Leaf object (or non-flattened non-prefix): yield directly.
                yield NameExpansionResult(uri_str, is_multi_src_request,
                                          src_uri_expands_to_multi,
                                          names_container,
                                          blr.GetUriString(),
                                          self.have_existing_dst_container,
                                          is_latest=blr.IsLatest())
                continue
            if not self.recursion_requested:
                # Containers are skipped (with a hint) unless -R was given.
                if blr.GetUri().is_file_uri():
                    desc = 'directory'
                else:
                    desc = 'bucket'
                print 'Omitting %s "%s". (Did you mean to do %s -R?)' % (
                    desc, blr.GetUri(), self.command_name)
                continue
            if blr.GetUri().is_file_uri():
                # Convert dir to implicit recursive wildcard.
                uri_to_iterate = '%s/%s' % (blr.GetUriString(), wc)
            else:
                # Convert bucket to implicit recursive wildcard.
                uri_to_iterate = blr.GetUri().clone_replace_name(wc)
            wc_iter = PluralityCheckableIterator(
                self._WildcardIterator(uri_to_iterate))
            # Recursive expansion may reveal plurality not visible earlier.
            src_uri_expands_to_multi = (src_uri_expands_to_multi
                                        or wc_iter.has_plurality())
            is_multi_src_request = (self.uri_strs.has_plurality()
                                    or src_uri_expands_to_multi)
            for blr in wc_iter:
                yield NameExpansionResult(uri_str, is_multi_src_request,
                                          src_uri_expands_to_multi, True,
                                          blr.GetUriString(),
                                          self.have_existing_dst_container,
                                          is_latest=blr.IsLatest())
def __iter__(self):
    """Python iterator that gets called when iterating over cloud wildcard.

    Yields:
        BucketListingRef, or empty iterator if no matches.
    """
    # First handle bucket wildcarding, if any: enumerate all buckets and
    # keep those whose names match the bucket-name wildcard.
    if ContainsWildcard(self.wildcard_uri.bucket_name):
        regex = fnmatch.translate(self.wildcard_uri.bucket_name)
        bucket_uris = []
        prog = re.compile(regex)
        self.proj_id_handler.FillInProjectHeaderIfNeeded(
            WILDCARD_BUCKET_ITERATOR, self.wildcard_uri, self.headers)
        for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
            if prog.match(b.name):
                # Use str(b.name) because get_all_buckets() returns Unicode
                # string, which when used to construct x-goog-copy-src
                # metadata requests for object-to-object copies causes
                # pathname '/' chars to be entity-encoded (bucket%2Fdir
                # instead of bucket/dir), which causes the request to fail.
                uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                       urllib.quote_plus(str(b.name)))
                # TODO: Move bucket_uris to a separate generator function
                # that yields values instead of pre-computing the list.
                bucket_uris.append(
                    boto.storage_uri(
                        uri_str, debug=self.debug,
                        bucket_storage_uri_class=self.bucket_storage_uri_class,
                        suppress_consec_slashes=False))
    else:
        bucket_uris = [self.wildcard_uri.clone_replace_name('')]

    # Now iterate over bucket(s), and handle object wildcarding, if any.
    self.proj_id_handler.FillInProjectHeaderIfNeeded(
        WILDCARD_OBJECT_ITERATOR, self.wildcard_uri, self.headers)
    for bucket_uri in bucket_uris:
        if self.wildcard_uri.names_bucket():
            # Bucket-only URI.
            yield BucketListingRef(bucket_uri, key=None, prefix=None,
                                   headers=self.headers)
        else:
            # URI contains an object name. If there's no wildcard just yield
            # the needed URI.
            if not ContainsWildcard(self.wildcard_uri.object_name):
                uri_to_yield = bucket_uri.clone_replace_name(
                    self.wildcard_uri.object_name)
                yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                                       headers=self.headers)
            else:
                # URI contains a wildcard. Expand iteratively by building
                # prefix/delimiter bucket listing request, filtering the
                # results per the current level's wildcard, and continuing
                # with the next component of the wildcard. See
                # _BuildBucketFilterStrings() documentation for details.
                #
                # Initialize the iteration with bucket name from bucket_uri
                # but object name from self.wildcard_uri. This is needed to
                # handle cases where both the bucket and object names
                # contain wildcards.
                uris_needing_expansion = [
                    bucket_uri.clone_replace_name(
                        self.wildcard_uri.object_name)]
                while len(uris_needing_expansion) > 0:
                    uri = uris_needing_expansion.pop(0)
                    (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
                        self._BuildBucketFilterStrings(uri.object_name))
                    prog = re.compile(fnmatch.translate(prefix_wildcard))
                    # List bucket for objects matching prefix up to
                    # delimiter.
                    for key in bucket_uri.list_bucket(
                            prefix=prefix, delimiter=delimiter,
                            headers=self.headers,
                            all_versions=self.all_versions):
                        # Check that the prefix regex matches rstripped
                        # key.name (to correspond with the rstripped
                        # prefix_wildcard from _BuildBucketFilterStrings()).
                        keyname = key.name
                        if isinstance(key, Prefix):
                            keyname = keyname.rstrip('/')
                        if prog.match(keyname):
                            if suffix_wildcard and keyname != suffix_wildcard:
                                if isinstance(key, Prefix):
                                    # There's more wildcard left to expand;
                                    # queue the deeper URI for another pass.
                                    uris_needing_expansion.append(
                                        uri.clone_replace_name(
                                            key.name.rstrip('/') + '/'
                                            + suffix_wildcard))
                            else:
                                # Done expanding.
                                expanded_uri = uri.clone_replace_key(key)
                                if isinstance(key, Prefix):
                                    yield BucketListingRef(
                                        expanded_uri, key=None, prefix=key,
                                        headers=self.headers)
                                else:
                                    if self.all_versions:
                                        yield BucketListingRef(
                                            expanded_uri, key=key,
                                            prefix=None,
                                            headers=self.headers)
                                    else:
                                        # Yield BLR wrapping version-less
                                        # URI.
                                        yield BucketListingRef(
                                            expanded_uri.clone_replace_name(
                                                expanded_uri.object_name),
                                            key=key, prefix=None,
                                            headers=self.headers)
def ExpandWildcardsAndContainers(self, uri_strs, recursion_requested,
                                 flat=True):
    """Expands wildcards, object-less bucket names, subdir bucket names,
    and directory names, producing a flat listing of all the matching
    objects/files.

    Args:
        uri_strs: List of URI strings needing expansion.
        recursion_requested: True if -R specified on command-line.
        flat: Bool indicating whether bucket listings should be flattened,
            i.e., so the mapped-to results contain objects spanning
            subdirectories.

    Returns:
        gslib.name_expansion.NameExpansionResult.

    Raises:
        CommandException: if errors encountered.

    Examples with flat=True:
      - Calling with one of the uri_strs being 'gs://bucket' will enumerate
        all top-level objects, as will 'gs://bucket/' and 'gs://bucket/*'.
      - 'gs://bucket/**' will enumerate all objects in the bucket.
      - 'gs://bucket/abc' will enumerate all next-level objects under
        directory abc (i.e., not including subdirectories of abc) if
        gs://bucket/abc/* matches any objects; otherwise it will enumerate
        the single name gs://bucket/abc.
      - 'gs://bucket/abc/**' will enumerate all objects under abc or any of
        its subdirectories.
      - 'file:///tmp' will enumerate all files under /tmp, as will
        'file:///tmp/*'.
      - 'file:///tmp/**' will enumerate all files under /tmp or any of its
        subdirectories.

    Example if flat=False: calling with gs://bucket/abc/* lists matching
    objects or subdirs, but not sub-subdirs or objects beneath subdirs.

    Note: In step-by-step comments below we give examples assuming there's
    a gs://bucket with object paths:
        abcd/o1.txt  abcd/o2.txt  xyz/o1.txt  xyz/o2.txt
    and a directory file://dir with file paths:
        dir/a.txt  dir/b.txt  dir/c/
    """
    result = NameExpansionResult()
    for uri_str in uri_strs:
        # Step 1: Expand any explicitly specified wildcards.
        # Starting with gs://buck*/abc* this step would expand to
        # gs://bucket/abcd.
        if ContainsWildcard(uri_str):
            post_step1_bucket_listing_refs = list(
                self.WildcardIterator(uri_str))
        else:
            post_step1_bucket_listing_refs = [
                BucketListingRef(self.suri_builder.StorageUri(uri_str))]

        # Step 2: Expand subdirs.
        # Starting with gs://bucket/abcd this step would expand to:
        # [abcd/o1.txt, abcd/o2.txt].
        # NOTE(review): uri_names_container takes the value from the *last*
        # ref's subdir expansion in the loop below — presumably intended;
        # confirm when multiple step-1 refs are possible.
        uri_names_container = False
        if flat:
            if recursion_requested:
                post_step2_bucket_listing_refs = []
                for bucket_listing_ref in post_step1_bucket_listing_refs:
                    (uri_names_container, bucket_listing_refs) = (
                        self._DoImplicitBucketSubdirExpansionIfApplicable(
                            bucket_listing_ref.GetUri(), flat))
                    post_step2_bucket_listing_refs.extend(
                        bucket_listing_refs)
            else:
                uri_names_container = False
                post_step2_bucket_listing_refs = (
                    post_step1_bucket_listing_refs)
        else:
            uri_names_container = False
            post_step2_bucket_listing_refs = post_step1_bucket_listing_refs

        # Step 3: Expand directories and buckets.
        # Starting with gs://bucket this step would expand to:
        # [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
        # Starting with file://dir this step would expand to:
        # [dir/a.txt, dir/b.txt, dir/c/]
        exp_src_bucket_listing_refs = []
        wc = self._flatness_wildcard[flat]
        for bucket_listing_ref in post_step2_bucket_listing_refs:
            if (not bucket_listing_ref.GetUri().names_container()
                and (flat or not bucket_listing_ref.HasPrefix())):
                # Leaf object: keep it as-is.
                exp_src_bucket_listing_refs.append(bucket_listing_ref)
                continue
            if not recursion_requested:
                # Containers are skipped (with a hint) unless -R was given.
                if bucket_listing_ref.GetUri().is_file_uri():
                    desc = 'directory'
                else:
                    desc = 'bucket'
                print 'Omitting %s "%s". (Did you mean to do %s -R?)' % (
                    desc, bucket_listing_ref.GetUri(), self.command_name)
                continue
            uri_names_container = True
            if bucket_listing_ref.GetUri().is_file_uri():
                # Convert dir to implicit recursive wildcard.
                uri_to_iter = '%s/%s' % (bucket_listing_ref.GetUriString(),
                                         wc)
            else:
                # Convert bucket to implicit recursive wildcard.
                uri_to_iter = bucket_listing_ref.GetUri().clone_replace_name(
                    wc)
            wildcard_result = list(self.WildcardIterator(uri_to_iter))
            if len(wildcard_result) > 0:
                exp_src_bucket_listing_refs.extend(wildcard_result)
        result._AddExpansion(self.suri_builder.StorageUri(uri_str),
                             uri_names_container,
                             exp_src_bucket_listing_refs)
    return result
def __iter__(self):
    """Python iterator that gets called when iterating over cloud wildcard.

    Yields:
        BucketListingRef, or empty iterator if no matches.
    """
    # First handle bucket wildcarding, if any: enumerate all buckets and
    # keep those whose names match the bucket-name wildcard.
    if ContainsWildcard(self.wildcard_uri.bucket_name):
        regex = fnmatch.translate(self.wildcard_uri.bucket_name)
        bucket_uris = []
        prog = re.compile(regex)
        self.proj_id_handler.FillInProjectHeaderIfNeeded(
            WILDCARD_BUCKET_ITERATOR, self.wildcard_uri, self.headers)
        for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
            if prog.match(b.name):
                # Use str(b.name) because get_all_buckets() returns Unicode
                # string, which when used to construct x-goog-copy-src
                # metadata requests for object-to-object copies causes
                # pathname '/' chars to be entity-encoded (bucket%2Fdir
                # instead of bucket/dir), which causes the request to fail.
                uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                       urllib.quote_plus(str(b.name)))
                bucket_uris.append(
                    boto.storage_uri(
                        uri_str, debug=self.debug,
                        bucket_storage_uri_class=self.bucket_storage_uri_class,
                        suppress_consec_slashes=False))
    else:
        bucket_uris = [self.wildcard_uri.clone_replace_name('')]

    # Now iterate over bucket(s), and handle object wildcarding, if any.
    self.proj_id_handler.FillInProjectHeaderIfNeeded(
        WILDCARD_OBJECT_ITERATOR, self.wildcard_uri, self.headers)
    for bucket_uri in bucket_uris:
        if self.wildcard_uri.names_bucket():
            # Bucket-only URI.
            yield BucketListingRef(bucket_uri, key=None, prefix=None,
                                   headers=self.headers)
        else:
            # URI contains an object name. If there's no wildcard just yield
            # the needed URI.
            if not ContainsWildcard(self.wildcard_uri.object_name):
                uri_to_yield = bucket_uri.clone_replace_name(
                    self.wildcard_uri.object_name)
                yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                                       headers=self.headers)
            else:
                # URI contains a wildcard. Expand iteratively by making a
                # prefix query of the string preceding the first wildcard
                # char, setting delimiter=/ (unless the wildcard is **),
                # then filtering the results by the wildcard at that level.
                # For example given the wildcard:
                #     gs://bucket/abc/d*e/f*.txt
                # we would:
                #   - get a bucket listing with prefix=abc/d, delimiter=/
                #   - filter each result for those that start with the
                #     result + *e
                # Assuming gs://bucket/abc/dxyze is a result from this
                # iteration, the next iteration would:
                #   - get a bucket listing with prefix=abc/dxyze,
                #     delimiter=/
                #   - filter each result for those that start with the
                #     result + f.txt
                #
                # Initialize the iteration with bucket name from bucket_uri
                # but object name from self.wildcard_uri. This is needed to
                # handle cases where both the bucket and object names
                # contain wildcards.
                uris_needing_expansion = [
                    bucket_uri.clone_replace_name(
                        self.wildcard_uri.object_name)]
                while len(uris_needing_expansion) > 0:
                    uri = uris_needing_expansion.pop(0)
                    (prefix, delimiter, prefix_wildcard, suffix) = (
                        self._BuildBucketFilterStrings(uri.object_name))
                    prog = re.compile(fnmatch.translate(prefix_wildcard))
                    # List bucket for objects matching prefix up to
                    # delimiter.
                    for key in bucket_uri.get_bucket(
                            validate=False, headers=self.headers).list(
                                prefix=prefix, delimiter=delimiter,
                                headers=self.headers):
                        # Check that the prefix regex matches.
                        # Match rstripped key.name, to correspond with the
                        # rstripped prefix_wildcard from
                        # _BuildBucketFilterStrings.
                        if prog.match(key.name.rstrip('/')):
                            if suffix and WILDCARD_REGEX.search(suffix):
                                # There's more wildcard left to expand;
                                # queue the deeper URI for another pass.
                                uris_needing_expansion.append(
                                    uri.clone_replace_name(key.name + suffix))
                            else:
                                # Done expanding.
                                if suffix:
                                    expanded_uri = uri.clone_replace_name(
                                        key.name + suffix)
                                else:
                                    expanded_uri = uri.clone_replace_name(
                                        key.name)
                                if isinstance(key, Prefix):
                                    yield BucketListingRef(
                                        expanded_uri, key=None, prefix=key,
                                        headers=self.headers)
                                else:
                                    yield BucketListingRef(
                                        expanded_uri, key=key, prefix=None,
                                        headers=self.headers)