def RunCommand(self): """Command entry point for the compose command.""" target_url_str = self.args[-1] self.args = self.args[:-1] target_url = StorageUrlFromString(target_url_str) self.CheckProvider(target_url) if target_url.HasGeneration(): raise CommandException('A version-specific URL (%s) cannot be ' 'the destination for gsutil compose - abort.' % target_url) dst_obj_metadata = apitools_messages.Object(name=target_url.object_name, bucket=target_url.bucket_name) components = [] # Remember the first source object so we can get its content type. first_src_url = None for src_url_str in self.args: if ContainsWildcard(src_url_str): src_url_iter = self.WildcardIterator(src_url_str).IterObjects() else: src_url_iter = [BucketListingObject(StorageUrlFromString(src_url_str))] for blr in src_url_iter: src_url = blr.storage_url self.CheckProvider(src_url) if src_url.bucket_name != target_url.bucket_name: raise CommandException( 'GCS does not support inter-bucket composing.') if not first_src_url: first_src_url = src_url src_obj_metadata = ( apitools_messages.ComposeRequest.SourceObjectsValueListEntry( name=src_url.object_name)) if src_url.HasGeneration(): src_obj_metadata.generation = src_url.generation components.append(src_obj_metadata) # Avoid expanding too many components, and sanity check each name # expansion result. if len(components) > MAX_COMPOSE_ARITY: raise CommandException('"compose" called with too many component ' 'objects. Limit is %d.' % MAX_COMPOSE_ARITY) if not components: raise CommandException('"compose" requires at least 1 component object.') dst_obj_metadata.contentType = self.gsutil_api.GetObjectMetadata( first_src_url.bucket_name, first_src_url.object_name, provider=first_src_url.scheme, fields=['contentType']).contentType preconditions = PreconditionsFromHeaders(self.headers or {}) self.logger.info( 'Composing %s from %d component object(s).', target_url, len(components)) self.gsutil_api.ComposeObject( components, dst_obj_metadata, preconditions=preconditions, provider=target_url.scheme, encryption_tuple=GetEncryptionKeyWrapper(config))
def _EnumerateStorageUrls(self, in_urls):
  """Expands any wildcarded URL strings into a flat list of StorageUrls."""
  ret = []
  for url_str in in_urls:
    if ContainsWildcard(url_str):
      ret.extend([blr.storage_url for blr in self.WildcardIterator(url_str)])
    else:
      ret.append(StorageUrlFromString(url_str))
  return ret
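
# Illustrative sketch of the same expand-or-passthrough pattern, reduced
# to plain strings so it runs standalone. The real method returns
# StorageUrl objects and uses self.WildcardIterator for expansion; the
# lambdas below are stand-ins.
def enumerate_urls(in_urls, expand_wildcard, contains_wildcard):
  ret = []
  for url_str in in_urls:
    if contains_wildcard(url_str):
      ret.extend(expand_wildcard(url_str))
    else:
      ret.append(url_str)
  return ret

assert enumerate_urls(
    ['gs://bkt/a*', 'gs://bkt/b.txt'],
    expand_wildcard=lambda u: ['gs://bkt/a1', 'gs://bkt/a2'],
    contains_wildcard=lambda u: '*' in u) == [
        'gs://bkt/a1', 'gs://bkt/a2', 'gs://bkt/b.txt']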
def RunCommand(self): """Command entry point for stat command.""" stat_fields = ENCRYPTED_FIELDS + UNENCRYPTED_FULL_LISTING_FIELDS found_nonmatching_arg = False for url_str in self.args: arg_matches = 0 url = StorageUrlFromString(url_str) if not url.IsObject(): raise CommandException( 'The stat command only works with object URLs') try: if ContainsWildcard(url_str): blr_iter = self.WildcardIterator(url_str).IterObjects( bucket_listing_fields=stat_fields) else: try: single_obj = self.gsutil_api.GetObjectMetadata( url.bucket_name, url.object_name, generation=url.generation, provider=url.scheme, fields=stat_fields) except EncryptionException: # Retry without requesting hashes. single_obj = self.gsutil_api.GetObjectMetadata( url.bucket_name, url.object_name, generation=url.generation, provider=url.scheme, fields=UNENCRYPTED_FULL_LISTING_FIELDS) blr_iter = [ BucketListingObject(url, root_object=single_obj) ] for blr in blr_iter: if blr.IsObject(): arg_matches += 1 # TODO: Request fewer fields if we're not printing the object. if logging.getLogger().isEnabledFor(logging.INFO): PrintFullInfoAboutObject(blr, incl_acl=False) except AccessDeniedException: if logging.getLogger().isEnabledFor(logging.INFO): sys.stderr.write( 'You aren\'t authorized to read %s - skipping' % url_str) except InvalidUrlError: raise except NotFoundException: pass if not arg_matches: if logging.getLogger().isEnabledFor(logging.INFO): sys.stderr.write(NO_URLS_MATCHED_TARGET % url_str) found_nonmatching_arg = True if found_nonmatching_arg: return 1 return 0
def _ExpandBucketWildcards(self, bucket_fields=None):
  """Expands bucket and provider wildcards.

  Builds a list of bucket url strings that can be iterated on.

  Args:
    bucket_fields: If present, populate only these metadata fields for
                   buckets. Example value: ['acl', 'defaultObjectAcl']

  Yields:
    BucketListingReferences of type BUCKET.
  """
  bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
  if (bucket_fields and set(bucket_fields) == set(['id']) and
      not ContainsWildcard(self.wildcard_url.bucket_name)):
    # If we just want the name of a non-wildcarded bucket URL,
    # don't make an RPC.
    yield BucketListingBucket(bucket_url)
  elif (self.wildcard_url.IsBucket() and
        not ContainsWildcard(self.wildcard_url.bucket_name)):
    # If we have a non-wildcarded bucket URL, get just that bucket.
    yield BucketListingBucket(
        bucket_url, root_object=self.gsutil_api.GetBucket(
            self.wildcard_url.bucket_name,
            provider=self.wildcard_url.scheme, fields=bucket_fields))
  else:
    regex = fnmatch.translate(self.wildcard_url.bucket_name)
    prog = re.compile(regex)

    fields = self._GetToListFields(bucket_fields)
    if fields:
      fields.add('items/id')
    for bucket in self.gsutil_api.ListBuckets(
        fields=fields, project_id=self.project_id,
        provider=self.wildcard_url.scheme):
      if prog.match(bucket.id):
        url = StorageUrlFromString(
            '%s://%s/' % (self.wildcard_url.scheme, bucket.id))
        yield BucketListingBucket(url, root_object=bucket)
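
# Illustrative sketch (hypothetical bucket names): the wildcarded branch
# above filters ListBuckets results client-side. fnmatch.translate turns
# the shell-style bucket wildcard into a regex that is matched against
# each returned bucket id.
import fnmatch
import re

prog = re.compile(fnmatch.translate('my-data-*'))
bucket_ids = ['my-data-2021', 'my-data-2022', 'unrelated']
assert [b for b in bucket_ids if prog.match(b)] == ['my-data-2021',
                                                    'my-data-2022']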
def RunCommand(self): """Command entry point for stat command.""" # List of fields we'll print for stat objects. stat_fields = [ 'updated', 'cacheControl', 'contentDisposition', 'contentEncoding', 'contentLanguage', 'size', 'contentType', 'componentCount', 'metadata', 'crc32c', 'md5Hash', 'etag', 'generation', 'metageneration' ] found_nonmatching_arg = False for url_str in self.args: arg_matches = 0 url = StorageUrlFromString(url_str) if not url.IsObject(): raise CommandException( 'The stat command only works with object URLs') try: if ContainsWildcard(url_str): blr_iter = self.WildcardIterator(url_str).IterObjects( bucket_listing_fields=stat_fields) else: single_obj = self.gsutil_api.GetObjectMetadata( url.bucket_name, url.object_name, generation=url.generation, provider=url.scheme, fields=stat_fields) blr_iter = [ BucketListingObject(url, root_object=single_obj) ] for blr in blr_iter: if blr.IsObject(): arg_matches += 1 if logging.getLogger().isEnabledFor(logging.INFO): PrintFullInfoAboutObject(blr, incl_acl=False) except AccessDeniedException: if logging.getLogger().isEnabledFor(logging.INFO): sys.stderr.write( 'You aren\'t authorized to read %s - skipping' % url_str) except InvalidUrlError: raise except NotFoundException: pass if not arg_matches: if logging.getLogger().isEnabledFor(logging.INFO): sys.stderr.write('No URLs matched %s' % url_str) found_nonmatching_arg = True if found_nonmatching_arg: return 1 return 0
def RunCommand(self): """Command entry point for the ls command.""" got_nomatch_errors = False got_bucket_nomatch_errors = False listing_style = ListingStyle.SHORT get_bucket_info = False self.recursion_requested = False self.all_versions = False self.include_etag = False self.human_readable = False if self.sub_opts: for o, a in self.sub_opts: if o == '-a': self.all_versions = True elif o == '-e': self.include_etag = True elif o == '-b': get_bucket_info = True elif o == '-h': self.human_readable = True elif o == '-l': listing_style = ListingStyle.LONG elif o == '-L': listing_style = ListingStyle.LONG_LONG elif o == '-p': self.project_id = a elif o == '-r' or o == '-R': self.recursion_requested = True if not self.args: # default to listing all gs buckets self.args = ['gs://'] total_objs = 0 total_bytes = 0 def MaybePrintBucketHeader(blr): if len(self.args) > 1: print '%s:' % blr.url_string.encode(UTF8) print_bucket_header = MaybePrintBucketHeader for url_str in self.args: storage_url = StorageUrlFromString(url_str) if storage_url.IsFileUrl(): raise CommandException('Only cloud URLs are supported for %s' % self.command_name) bucket_fields = None if (listing_style == ListingStyle.SHORT or listing_style == ListingStyle.LONG): bucket_fields = ['id'] elif listing_style == ListingStyle.LONG_LONG: bucket_fields = [ 'location', 'storageClass', 'versioning', 'acl', 'defaultObjectAcl', 'website', 'logging', 'cors', 'lifecycle' ] if storage_url.IsProvider(): # Provider URL: use bucket wildcard to list buckets. for blr in self.WildcardIterator( '%s://*' % storage_url.scheme).IterBuckets( bucket_fields=bucket_fields): self._PrintBucketInfo(blr, listing_style) elif storage_url.IsBucket() and get_bucket_info: # ls -b bucket listing request: List info about bucket(s). total_buckets = 0 for blr in self.WildcardIterator(url_str).IterBuckets( bucket_fields=bucket_fields): if not ContainsWildcard(url_str) and not blr.root_object: # Iterator does not make an HTTP call for non-wildcarded # listings with fields=='id'. Ensure the bucket exists by calling # GetBucket. self.gsutil_api.GetBucket(blr.storage_url.bucket_name, fields=['id'], provider=storage_url.scheme) self._PrintBucketInfo(blr, listing_style) total_buckets += 1 if not ContainsWildcard(url_str) and not total_buckets: got_bucket_nomatch_errors = True else: # URL names a bucket, object, or object subdir -> # list matching object(s) / subdirs. def _PrintPrefixLong(blr): print '%-33s%s' % ('', blr.url_string.encode(UTF8)) if listing_style == ListingStyle.SHORT: # ls helper by default readies us for a short listing. 
ls_helper = LsHelper( self.WildcardIterator, self.logger, all_versions=self.all_versions, print_bucket_header_func=print_bucket_header, should_recurse=self.recursion_requested) elif listing_style == ListingStyle.LONG: bucket_listing_fields = ['name', 'updated', 'size'] if self.all_versions: bucket_listing_fields.extend( ['generation', 'metageneration']) if self.include_etag: bucket_listing_fields.append('etag') ls_helper = LsHelper( self.WildcardIterator, self.logger, print_object_func=self._PrintLongListing, print_dir_func=_PrintPrefixLong, print_bucket_header_func=print_bucket_header, all_versions=self.all_versions, should_recurse=self.recursion_requested, fields=bucket_listing_fields) elif listing_style == ListingStyle.LONG_LONG: # List all fields bucket_listing_fields = None ls_helper = LsHelper( self.WildcardIterator, self.logger, print_object_func=PrintFullInfoAboutObject, print_dir_func=_PrintPrefixLong, print_bucket_header_func=print_bucket_header, all_versions=self.all_versions, should_recurse=self.recursion_requested, fields=bucket_listing_fields) else: raise CommandException('Unknown listing style: %s' % listing_style) exp_dirs, exp_objs, exp_bytes = ls_helper.ExpandUrlAndPrint( storage_url) if storage_url.IsObject() and exp_objs == 0 and exp_dirs == 0: got_nomatch_errors = True total_bytes += exp_bytes total_objs += exp_objs if total_objs and listing_style != ListingStyle.SHORT: print('TOTAL: %d objects, %d bytes (%s)' % (total_objs, total_bytes, MakeHumanReadable(float(total_bytes)))) if got_nomatch_errors: raise CommandException('One or more URLs matched no objects.') if got_bucket_nomatch_errors: raise NotFoundException( 'One or more bucket URLs matched no buckets.') return 0
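
# Illustrative sketch: an approximation of the MakeHumanReadable
# formatting used in the TOTAL line above (binary units; the exact
# output format of gsutil's helper may differ).
def make_human_readable(num_bytes):
  for unit in ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB'):
    if num_bytes < 1024.0 or unit == 'PiB':
      return '%.2f %s' % (num_bytes, unit)
    num_bytes /= 1024.0

assert make_human_readable(1536.0) == '1.50 KiB'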
def __iter__(self, bucket_listing_fields=None,
             expand_top_level_buckets=False):
  """Iterator that gets called when iterating over the cloud wildcard.

  In the case where no wildcard is present, returns a single matching
  object, single matching prefix, or one of each if both exist.

  Args:
    bucket_listing_fields: Iterable fields to include in bucket listings.
        Ex. ['name', 'acl']. Iterator is responsible for converting these
        to list-style format ['items/name', 'items/acl'] as well as adding
        any fields necessary for listing such as prefixes. API
        implementation is responsible for adding pagination fields. If
        this is None, all fields are returned.
    expand_top_level_buckets: If true, yield no BUCKET references.
        Instead, expand buckets into top-level objects and prefixes.

  Yields:
    BucketListingRef of type BUCKET, OBJECT or PREFIX.
  """
  single_version_request = self.wildcard_url.HasGeneration()

  # For wildcard expansion purposes, we need at a minimum the name of
  # each object and prefix. If we're not using the default of requesting
  # all fields, make sure at least these are requested. The Cloud API
  # tolerates specifying the same field twice.
  get_fields = None
  if bucket_listing_fields:
    get_fields = set()
    for field in bucket_listing_fields:
      get_fields.add(field)
    bucket_listing_fields = self._GetToListFields(
        get_fields=bucket_listing_fields)
    bucket_listing_fields.update(['items/name', 'prefixes'])
    get_fields.update(['name'])
    # If we're making versioned requests, ensure generation and
    # metageneration are also included.
    if single_version_request or self.all_versions:
      bucket_listing_fields.update(
          ['items/generation', 'items/metageneration'])
      get_fields.update(['generation', 'metageneration'])

  # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
  # iterate over the expanded bucket strings and handle any object
  # wildcarding.
  for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
    bucket_url_string = bucket_listing_ref.url_string
    if self.wildcard_url.IsBucket():
      # IsBucket() guarantees there are no prefix or object wildcards, and
      # thus this is a top-level listing of buckets.
      if expand_top_level_buckets:
        url = StorageUrlFromString(bucket_url_string)
        for obj_or_prefix in self.gsutil_api.ListObjects(
            url.bucket_name, delimiter='/', all_versions=self.all_versions,
            provider=self.wildcard_url.scheme,
            fields=bucket_listing_fields):
          if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
            yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data,
                                     with_version=self.all_versions)
          else:  # CloudApi.CsObjectOrPrefixType.PREFIX:
            yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
      else:
        yield bucket_listing_ref
    else:
      # By default, assume a non-wildcarded URL is an object, not a prefix.
      # This prevents unnecessary listings (which are slower, more
      # expensive, and also subject to eventual consistency).
      if (not ContainsWildcard(self.wildcard_url.url_string) and
          self.wildcard_url.IsObject() and not self.all_versions):
        try:
          get_object = self.gsutil_api.GetObjectMetadata(
              self.wildcard_url.bucket_name,
              self.wildcard_url.object_name,
              generation=self.wildcard_url.generation,
              provider=self.wildcard_url.scheme,
              fields=get_fields)
          yield self._GetObjectRef(
              self.wildcard_url.bucket_url_string, get_object,
              with_version=(self.all_versions or single_version_request))
          return
        except (NotFoundException, AccessDeniedException):
          # It's possible this is a prefix - try to list instead.
          pass

      # Expand iteratively by building prefix/delimiter bucket listing
      # request, filtering the results per the current level's wildcard
      # (if present), and continuing with the next component of the
      # wildcard. See _BuildBucketFilterStrings() documentation for
      # details.
      if single_version_request:
        url_string = '%s%s#%s' % (bucket_url_string,
                                  self.wildcard_url.object_name,
                                  self.wildcard_url.generation)
      else:
        # Rstrip any prefixes to correspond with rstripped prefix wildcard
        # from _BuildBucketFilterStrings().
        url_string = '%s%s' % (
            bucket_url_string,
            StripOneSlash(self.wildcard_url.object_name) or
            '/')  # Cover root object named '/' case.
      urls_needing_expansion = [url_string]
      while urls_needing_expansion:
        url = StorageUrlFromString(urls_needing_expansion.pop(0))
        (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
            self._BuildBucketFilterStrings(url.object_name))
        prog = re.compile(fnmatch.translate(prefix_wildcard))

        # If we have a suffix wildcard, we only care about listing
        # prefixes.
        listing_fields = (
            set(['prefixes']) if suffix_wildcard else bucket_listing_fields)

        # List bucket for objects matching prefix up to delimiter.
        for obj_or_prefix in self.gsutil_api.ListObjects(
            url.bucket_name, prefix=prefix, delimiter=delimiter,
            all_versions=self.all_versions or single_version_request,
            provider=self.wildcard_url.scheme, fields=listing_fields):
          if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
            gcs_object = obj_or_prefix.data
            if prog.match(gcs_object.name):
              if not suffix_wildcard or (
                  StripOneSlash(gcs_object.name) == suffix_wildcard):
                if not single_version_request or (
                    self._SingleVersionMatches(gcs_object.generation)):
                  yield self._GetObjectRef(
                      bucket_url_string, gcs_object,
                      with_version=(self.all_versions or
                                    single_version_request))
          else:  # CloudApi.CsObjectOrPrefixType.PREFIX
            prefix = obj_or_prefix.data

            if ContainsWildcard(prefix):
              # TODO: Disambiguate user-supplied strings from iterated
              # prefix and object names so that we can better reason
              # about wildcards and handle this case without raising an
              # error.
              raise CommandException(
                  'Cloud folder %s%s contains a wildcard; gsutil does '
                  'not currently support objects with wildcards in their '
                  'name.' % (bucket_url_string, prefix))

            # If the prefix ends with a slash, remove it. Note that we
            # only remove one slash so that we can successfully enumerate
            # dirs containing multiple slashes.
            rstripped_prefix = StripOneSlash(prefix)
            if prog.match(rstripped_prefix):
              if suffix_wildcard and rstripped_prefix != suffix_wildcard:
                # There's more wildcard left to expand.
                url_append_string = '%s%s' % (
                    bucket_url_string,
                    rstripped_prefix + '/' + suffix_wildcard)
                urls_needing_expansion.append(url_append_string)
              else:
                # No wildcard to expand, just yield the prefix.
                yield self._GetPrefixRef(bucket_url_string, prefix)
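
# Illustrative sketch (made-up names): a toy model of the iterative
# expansion above. List one level of a flat namespace using prefix +
# delimiter (as a ListObjects call would), then filter the returned
# prefixes against the wildcard regex. The real code strips exactly one
# slash via StripOneSlash; rstrip('/') suffices for this toy.
import fnmatch
import re

names = ['a/b/x.txt', 'a/c/x.txt', 'd/x.txt']

def list_prefixes(prefix, delimiter='/'):
  seen = set()
  for name in names:
    if name.startswith(prefix):
      rest = name[len(prefix):]
      if delimiter in rest:
        seen.add(prefix + rest.split(delimiter)[0] + delimiter)
  return sorted(seen)

prog = re.compile(fnmatch.translate('a/*'))
assert [p for p in list_prefixes('a/')
        if prog.match(p.rstrip('/'))] == ['a/b/', 'a/c/']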
def testContainsWildcard(self):
  """Tests ContainsWildcard call."""
  self.assertTrue(ContainsWildcard('a*.txt'))
  self.assertTrue(ContainsWildcard('a[0-9].txt'))
  self.assertFalse(ContainsWildcard('0-9.txt'))
  self.assertTrue(ContainsWildcard('?.txt'))
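
# Illustrative sketch: a minimal implementation consistent with the
# assertions above (the real gsutil ContainsWildcard may differ in
# detail) - a URL contains a wildcard iff it contains *, ?, [ or ].
import re

WILDCARD_REGEX = re.compile(r'[*?\[\]]')

def contains_wildcard(url_string):
  return bool(WILDCARD_REGEX.search(url_string))

assert contains_wildcard('a*.txt')
assert contains_wildcard('a[0-9].txt')
assert not contains_wildcard('0-9.txt')
assert contains_wildcard('?.txt')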
def RunCommand(self): """Command entry point for the du command.""" self.line_ending = '\n' self.all_versions = False self.produce_total = False self.human_readable = False self.summary_only = False self.exclude_patterns = [] if self.sub_opts: for o, a in self.sub_opts: if o == '-0': self.line_ending = '\0' elif o == '-a': self.all_versions = True elif o == '-c': self.produce_total = True elif o == '-e': self.exclude_patterns.append(a) elif o == '-h': self.human_readable = True elif o == '-s': self.summary_only = True elif o == '-X': if a == '-': f = sys.stdin else: f = open(a, 'r') try: for line in f: line = line.strip() if line: self.exclude_patterns.append(line) finally: f.close() if not self.args: # Default to listing all gs buckets. self.args = ['gs://'] total_bytes = 0 got_nomatch_errors = False def _PrintObjectLong(blr): return self._PrintInfoAboutBucketListingRef(blr) def _PrintNothing(unused_blr=None): pass def _PrintDirectory(num_bytes, name): if not self.summary_only: self._PrintSummaryLine(num_bytes, name) for url_arg in self.args: top_level_storage_url = StorageUrlFromString(url_arg) if top_level_storage_url.IsFileUrl(): raise CommandException('Only cloud URLs are supported for %s' % self.command_name) bucket_listing_fields = ['size'] ls_helper = LsHelper(self.WildcardIterator, self.logger, print_object_func=_PrintObjectLong, print_dir_func=_PrintNothing, print_dir_header_func=_PrintNothing, print_dir_summary_func=_PrintDirectory, print_newline_func=_PrintNothing, all_versions=self.all_versions, should_recurse=True, exclude_patterns=self.exclude_patterns, fields=bucket_listing_fields) # ls_helper expands to objects and prefixes, so perform a top-level # expansion first. if top_level_storage_url.IsProvider(): # Provider URL: use bucket wildcard to iterate over all buckets. top_level_iter = self.WildcardIterator( '%s://*' % top_level_storage_url.scheme).IterBuckets( bucket_fields=['id']) elif top_level_storage_url.IsBucket(): top_level_iter = self.WildcardIterator( '%s://%s' % (top_level_storage_url.scheme, top_level_storage_url.bucket_name)).IterBuckets( bucket_fields=['id']) else: top_level_iter = [BucketListingObject(top_level_storage_url)] for blr in top_level_iter: storage_url = blr.storage_url if storage_url.IsBucket() and self.summary_only: storage_url = StorageUrlFromString( storage_url.CreatePrefixUrl(wildcard_suffix='**')) _, exp_objs, exp_bytes = ls_helper.ExpandUrlAndPrint( storage_url) if (storage_url.IsObject() and exp_objs == 0 and ContainsWildcard(url_arg) and not self.exclude_patterns): got_nomatch_errors = True total_bytes += exp_bytes if self.summary_only: self._PrintSummaryLine(exp_bytes, blr.url_string.rstrip('/')) if self.produce_total: self._PrintSummaryLine(total_bytes, 'total') if got_nomatch_errors: raise CommandException('One or more URLs matched no objects.') return 0