def get_component_count(file_size, target_component_size, max_components):
  """Returns the number of components for a composite upload.

  Args:
    file_size (int|None): Total byte size of the file being divided into
      components. None if it could not be determined.
    target_component_size (int|str): Target size for each component if the
      total component count isn't capped by max_components. May be a byte
      count int or a size string (e.g. "50M").
    max_components (int|None): Limit on allowed components regardless of
      file_size and target_component_size. None indicates no limit.

  Returns:
    int: Number of components to split the file into for a composite upload.
  """
  if file_size is None:
    return 1
  if isinstance(target_component_size, int):
    target_component_size_bytes = target_component_size
  else:
    target_component_size_bytes = scaled_integer.ParseInteger(
        target_component_size)
  return min(
      math.ceil(file_size / target_component_size_bytes),
      max_components if max_components is not None else float('inf'))
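
# Worked example (a hedged, standalone sketch of the arithmetic above; the
# 1 GB file size, "50M" target, and 32-component cap are illustrative values,
# not ones from the source). Assumes scaled_integer.ParseInteger('50M')
# parses to 50 * 10**6 bytes (decimal mega).
import math

file_size = 1_000_000_000           # Hypothetical 1 GB file.
target_component_size = 50 * 10**6  # Parsed from '50M'.
max_components = 32                 # Hypothetical API cap.

component_count = min(
    math.ceil(file_size / target_component_size),  # ceil(20.0) == 20.
    max_components if max_components is not None else float('inf'))
assert component_count == 20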
def run(self):
  if self._serialization_data is not None:
    apitools_upload = transfer.Upload.FromData(
        self._source_stream,
        json.dumps(self._serialization_data),
        self._gcs_api.client.http,
        auto_transfer=False,
        gzip_encoded=self._request_config.gzip_encoded)
  else:
    apitools_upload = transfer.Upload(
        self._source_stream,
        self._content_type,
        auto_transfer=False,
        chunksize=scaled_integer.ParseInteger(
            properties.VALUES.storage.upload_chunk_size.Get()),
        gzip_encoded=self._request_config.gzip_encoded,
        total_size=self._request_config.size)
  apitools_upload.strategy = transfer.RESUMABLE_UPLOAD
  apitools_upload.bytes_http = self._http_client

  if not apitools_upload.initialized:
    self._gcs_api.client.objects.Insert(
        self._get_validated_insert_request(), upload=apitools_upload)
  if self._tracker_callback is not None:
    self._tracker_callback(apitools_upload.serialization_data)

  if self._request_config.gzip_encoded:
    http_response = apitools_upload.StreamInChunks()
  else:
    http_response = apitools_upload.StreamMedia()
  return self._gcs_api.client.objects.ProcessHttpResponse(
      self._gcs_api.client.objects.GetMethodConfig('Insert'), http_response)
def __init__(self,
             source_resource,
             destination_resource,
             delete_source=False,
             print_created_message=False,
             user_request_args=None):
  """Initializes task.

  Args:
    source_resource (resource_reference.FileObjectResource): Must contain
      local filesystem path to upload object. Does not need to contain
      metadata.
    destination_resource (resource_reference.ObjectResource|UnknownResource):
      Must contain the full object path. Directories will not be accepted.
      Existing objects at this location will be overwritten.
    delete_source (bool): If copy completes successfully, delete the source
      object afterwards.
    print_created_message (bool): Print a message containing the versioned
      URL of the copy result.
    user_request_args (UserRequestArgs|None): Values for RequestConfig.
  """
  super(FileUploadTask, self).__init__(
      source_resource, destination_resource,
      user_request_args=user_request_args)
  self._delete_source = delete_source
  self._print_created_message = print_created_message
  self.parallel_processing_key = (
      self._destination_resource.storage_url.url_string)
  self._composite_upload_threshold = scaled_integer.ParseInteger(
      properties.VALUES.storage.parallel_composite_upload_threshold.Get())
def _get_component_count(file_size, api_max_component_count):
  """Returns the number of components to use for an upload."""
  preferred_component_size = scaled_integer.ParseInteger(
      properties.VALUES.storage.parallel_composite_upload_component_size.Get())
  component_count = math.ceil(file_size / preferred_component_size)
  if component_count < 2:
    return 2
  if component_count > api_max_component_count:
    return api_max_component_count
  return component_count
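
# Hedged sketch of the clamping behavior above: the count derived from the
# preferred component size is forced into [2, api_max_component_count].
# All values below are illustrative, not from the source.
import math

def clamp_component_count(file_size, preferred_component_size, api_max):
  count = math.ceil(file_size / preferred_component_size)
  return max(2, min(count, api_max))

assert clamp_component_count(10 * 2**20, 50 * 2**20, 32) == 2   # Tiny file.
assert clamp_component_count(10 * 2**30, 50 * 2**20, 32) == 32  # Capped.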
def _should_perform_sliced_download(source_resource, destination_resource):
  """Returns True if conditions are right for a sliced download."""
  if destination_resource.storage_url.is_pipe:
    # Can't write to different indices of pipe.
    return False

  if (not source_resource.crc32c_hash and
      properties.VALUES.storage.check_hashes.Get() !=
      properties.CheckHashes.NEVER.value):
    # Do not perform sliced download if hash validation is not possible.
    return False

  threshold = scaled_integer.ParseInteger(
      properties.VALUES.storage.sliced_object_download_threshold.Get())
  component_size = scaled_integer.ParseInteger(
      properties.VALUES.storage.sliced_object_download_component_size.Get())
  # TODO(b/183017513): Only perform sliced downloads with parallelism.
  api_capabilities = api_factory.get_capabilities(
      source_resource.storage_url.scheme)
  return (source_resource.size and threshold != 0 and
          source_resource.size > threshold and component_size and
          cloud_api.Capability.SLICED_DOWNLOAD in api_capabilities and
          task_util.should_use_parallelism())
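
# Hedged distillation of the size gate in _should_perform_sliced_download,
# with assumed property values; the pipe, hash, capability, and parallelism
# checks from the real predicate are left out for brevity.
def passes_size_gate(size, threshold, component_size):
  return bool(size and threshold != 0 and size > threshold and component_size)

assert passes_size_gate(500 * 2**20, 150 * 2**20, 50 * 2**20)
assert not passes_size_gate(100 * 2**20, 150 * 2**20, 50 * 2**20)  # Too small.
assert not passes_size_gate(500 * 2**20, 0, 50 * 2**20)  # Slicing disabled.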
def __init__(self):
  """Initializes response handler for requests downloads."""
  super(_StorageStreamResponseHandler, self).__init__(use_stream=True)
  self._stream = None
  self._digesters = {}
  self._processed_bytes = 0
  self._progress_callback = None

  self._chunk_size = scaled_integer.ParseInteger(
      properties.VALUES.storage.download_chunk_size.Get())
  # If the progress callback is called more frequently than every 512 KB,
  # it can degrade performance.
  self._progress_callback_threshold = max(
      MINIMUM_PROGRESS_CALLBACK_THRESHOLD, self._chunk_size)
def Convert(self, string):
  if not string:
    return None
  try:
    value = scaled_integer.ParseInteger(
        string, default_unit=self._default_unit, type_abbr=self._type_abbr)
    if self._output_unit_value:
      value //= self._output_unit_value
    return value
  except ValueError as e:
    raise exceptions.ParseError(
        self.GetPresentationName(),
        'Failed to parse binary/decimal scaled integer [{}]: {}.'.format(
            string, _SubException(e)))
def download_object(self,
                    cloud_resource,
                    download_stream,
                    compressed_encoding=False,
                    decryption_wrapper=None,
                    digesters=None,
                    download_strategy=cloud_api.DownloadStrategy.ONE_SHOT,
                    progress_callback=None,
                    start_byte=0,
                    end_byte=None):
  """See super class."""
  extra_args = {}
  if cloud_resource.generation:
    extra_args['VersionId'] = cloud_resource.generation

  if download_strategy == cloud_api.DownloadStrategy.RESUMABLE:
    response = self.client.get_object(
        Bucket=cloud_resource.bucket,
        Key=cloud_resource.name,
        Range='bytes={}-'.format(start_byte),
    )
    processed_bytes = start_byte
    for chunk in response['Body'].iter_chunks(
        scaled_integer.ParseInteger(
            properties.VALUES.storage.download_chunk_size.Get())):
      download_stream.write(chunk)
      processed_bytes += len(chunk)
      if progress_callback:
        progress_callback(processed_bytes)
  else:
    # TODO(b/172480278): Conditionally call get_object for smaller objects.
    self.client.download_fileobj(
        cloud_resource.bucket,
        cloud_resource.name,
        download_stream,
        Callback=progress_callback,
        ExtraArgs=extra_args)

    # The download callback doesn't give us streaming data, so we have to
    # read the whole downloaded file to update digests.
    if digesters:
      with files.BinaryFileReader(
          download_stream.name) as completed_download_stream:
        completed_download_stream.seek(0)
        for hash_algorithm in digesters:
          digesters[hash_algorithm] = hash_util.get_hash_from_file_stream(
              completed_download_stream, hash_algorithm)

  return self._get_content_encoding(cloud_resource)
def _GetChunkSize(self):
  """Returns the property-defined chunk size corrected for server granularity.

  Chunk size for GCS must be a multiple of 256 KiB. This function rounds up
  the property-defined chunk size to the nearest chunk size interval.
  """
  gcs_chunk_granularity = 256 * 1024  # 256 KiB
  chunksize = scaled_integer.ParseInteger(
      properties.VALUES.storage.upload_chunk_size.Get())
  if chunksize == 0:
    chunksize = None  # Use apitools default (1048576 B).
  elif chunksize % gcs_chunk_granularity != 0:
    chunksize += gcs_chunk_granularity - (chunksize % gcs_chunk_granularity)
  return chunksize
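
# Hedged sketch of the 256 KiB rounding performed by _GetChunkSize, with an
# illustrative chunk size that is not already a multiple of the granularity.
GCS_CHUNK_GRANULARITY = 256 * 1024  # 256 KiB.

def round_up_to_granularity(chunksize):
  remainder = chunksize % GCS_CHUNK_GRANULARITY
  if remainder:
    chunksize += GCS_CHUNK_GRANULARITY - remainder
  return chunksize

# 300 KiB is not a multiple of 256 KiB, so it rounds up to 512 KiB.
assert round_up_to_granularity(300 * 1024) == 512 * 1024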
def _get_upload(self):
  """Returns an apitools upload class used for a new transfer."""
  resource_args = self._request_config.resource_args
  size = getattr(resource_args, 'size', None)
  max_retries = properties.VALUES.storage.max_retries.GetInt()
  apitools_upload = transfer.Upload(
      self._source_stream,
      resource_args.content_type,
      auto_transfer=False,
      chunksize=scaled_integer.ParseInteger(
          properties.VALUES.storage.upload_chunk_size.Get()),
      gzip_encoded=self._should_gzip_in_flight,
      total_size=size,
      num_retries=max_retries)
  apitools_upload.strategy = transfer.RESUMABLE_UPLOAD
  return apitools_upload
def get_upload_strategy(api, object_length):
  """Determines if a resumable upload should be performed.

  Args:
    api (CloudApi): An API instance to check if it supports resumable upload.
    object_length (int): Length of the data to be uploaded.

  Returns:
    cloud_api.UploadStrategy: RESUMABLE if a resumable upload can be
      performed, otherwise SIMPLE.
  """
  resumable_threshold = scaled_integer.ParseInteger(
      properties.VALUES.storage.resumable_threshold.Get())
  if (object_length >= resumable_threshold and
      cloud_api.Capability.RESUMABLE_UPLOAD in api.capabilities):
    return cloud_api.UploadStrategy.RESUMABLE
  else:
    return cloud_api.UploadStrategy.SIMPLE
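
# Hedged illustration of the threshold check in get_upload_strategy. The
# 8 MiB threshold is an assumed value for storage/resumable_threshold, not
# necessarily the configured default.
RESUMABLE_THRESHOLD = 8 * 2**20  # Assumed parsed property value.

def is_resumable_candidate(object_length, supports_resumable):
  return object_length >= RESUMABLE_THRESHOLD and supports_resumable

assert is_resumable_candidate(100 * 2**20, True)       # Large object.
assert not is_resumable_candidate(1 * 2**20, True)     # Below threshold.
assert not is_resumable_candidate(100 * 2**20, False)  # No API support.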
def __init__(self,
             source_resource,
             destination_resource,
             delete_source=False,
             do_not_decompress=False,
             print_created_message=False,
             user_request_args=None):
  """Initializes task.

  Args:
    source_resource (ObjectResource): Must contain the full path of object
      to download, including bucket. Directories will not be accepted. Does
      not need to contain metadata.
    destination_resource (FileObjectResource|UnknownResource): Must contain
      local filesystem path to destination object. Does not need to contain
      metadata.
    delete_source (bool): If copy completes successfully, delete the source
      object afterwards.
    do_not_decompress (bool): Prevents automatically decompressing
      downloaded gzips.
    print_created_message (bool): Print a message containing the versioned
      URL of the copy result.
    user_request_args (UserRequestArgs|None): Values for RequestConfig.
  """
  super(FileDownloadTask, self).__init__(
      source_resource, destination_resource,
      user_request_args=user_request_args)
  self._delete_source = delete_source
  self._do_not_decompress = do_not_decompress
  self._print_created_message = print_created_message

  self._temporary_destination_resource = (
      self._get_temporary_destination_resource())

  if (self._source_resource.size and
      self._source_resource.size >= scaled_integer.ParseInteger(
          properties.VALUES.storage.resumable_threshold.Get())):
    self._strategy = cloud_api.DownloadStrategy.RESUMABLE
  else:
    self._strategy = cloud_api.DownloadStrategy.ONE_SHOT

  self.parallel_processing_key = (
      self._destination_resource.storage_url.url_string)
def __init__(self, source_resource, destination_resource):
  """Initializes task.

  Args:
    source_resource (resource_reference.FileObjectResource): Must contain
      local filesystem path to upload object. Does not need to contain
      metadata.
    destination_resource (resource_reference.ObjectResource|UnknownResource):
      Must contain the full object path. Directories will not be accepted.
      Existing objects at this location will be overwritten.
  """
  super(FileUploadTask, self).__init__()
  self._source_resource = source_resource
  self._destination_resource = destination_resource
  self.parallel_processing_key = (
      self._destination_resource.storage_url.url_string)

  self._composite_upload_threshold = scaled_integer.ParseInteger(
      properties.VALUES.storage.parallel_composite_upload_threshold.Get())
def _download_object(self, cloud_resource, download_stream, digesters,
                     progress_callback, start_byte):
  get_object_args = {
      'Bucket': cloud_resource.bucket,
      'Key': cloud_resource.name,
      'Range': 'bytes={}-'.format(start_byte),
  }
  if cloud_resource.generation is not None:
    get_object_args['VersionId'] = str(cloud_resource.generation)
  response = self.client.get_object(**get_object_args)

  processed_bytes = start_byte
  for chunk in response['Body'].iter_chunks(
      scaled_integer.ParseInteger(
          properties.VALUES.storage.download_chunk_size.Get())):
    download_stream.write(chunk)
    for hash_object in digesters.values():
      hash_object.update(chunk)
    processed_bytes += len(chunk)
    if progress_callback:
      progress_callback(processed_bytes)

  return response.get('ContentEncoding')
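
# Hedged sketch of the chunked read-write-digest pattern _download_object
# uses, recreated with hashlib and in-memory streams so it runs standalone.
import hashlib
import io

source = io.BytesIO(b'example object data' * 1024)  # Stand-in for the body.
download_stream = io.BytesIO()
digesters = {'md5': hashlib.md5()}
chunk_size = 8192  # Illustrative; the real code parses download_chunk_size.

processed_bytes = 0
while True:
  chunk = source.read(chunk_size)
  if not chunk:
    break
  download_stream.write(chunk)
  for hash_object in digesters.values():
    hash_object.update(chunk)  # Digest while streaming; no second read pass.
  processed_bytes += len(chunk)

assert digesters['md5'].hexdigest() == hashlib.md5(
    download_stream.getvalue()).hexdigest()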
def copy_object(self,
                source_resource,
                destination_resource,
                progress_callback=None,
                request_config=None):
  """See super class."""
  # TODO(b/161898251): Implement encryption and decryption.
  if not request_config:
    request_config = GcsRequestConfig()

  destination_metadata = getattr(destination_resource, 'metadata', None)
  if not destination_metadata:
    destination_metadata = gcs_metadata_util.get_apitools_metadata_from_url(
        destination_resource.storage_url)
  if source_resource.metadata:
    gcs_metadata_util.copy_select_object_metadata(source_resource.metadata,
                                                  destination_metadata)

  if request_config.max_bytes_per_call:
    max_bytes_per_call = request_config.max_bytes_per_call
  else:
    max_bytes_per_call = scaled_integer.ParseInteger(
        properties.VALUES.storage.copy_chunk_size.Get())

  if request_config.predefined_acl_string:
    predefined_acl = getattr(
        self.messages.StorageObjectsRewriteRequest
        .DestinationPredefinedAclValueValuesEnum,
        request_config.predefined_acl_string)
  else:
    predefined_acl = None

  if source_resource.generation is None:
    source_generation = None
  else:
    source_generation = int(source_resource.generation)

  tracker_file_path = tracker_file_util.get_tracker_file_path(
      destination_resource.storage_url,
      tracker_file_util.TrackerFileType.REWRITE,
      source_resource.storage_url)
  rewrite_parameters_hash = (
      tracker_file_util.hash_gcs_rewrite_parameters_for_tracker_file(
          source_resource,
          destination_resource,
          destination_metadata,
          request_config=request_config))

  try:
    resume_rewrite_token = tracker_file_util.read_rewrite_tracker_file(
        tracker_file_path, rewrite_parameters_hash)
    log.debug('Found rewrite token. Resuming copy.')
  except files.MissingFileError:
    resume_rewrite_token = None
    log.debug('No rewrite token found. Starting copy from scratch.')

  while True:
    request = self.messages.StorageObjectsRewriteRequest(
        sourceBucket=source_resource.storage_url.bucket_name,
        sourceObject=source_resource.storage_url.object_name,
        destinationBucket=destination_resource.storage_url.bucket_name,
        destinationObject=destination_resource.storage_url.object_name,
        object=destination_metadata,
        sourceGeneration=source_generation,
        ifGenerationMatch=request_config.precondition_generation_match,
        ifMetagenerationMatch=(
            request_config.precondition_metageneration_match),
        destinationPredefinedAcl=predefined_acl,
        rewriteToken=resume_rewrite_token,
        maxBytesRewrittenPerCall=max_bytes_per_call)
    rewrite_response = self.client.objects.Rewrite(request)
    processed_bytes = rewrite_response.totalBytesRewritten
    if progress_callback:
      progress_callback(processed_bytes)

    if rewrite_response.done:
      break
    elif not resume_rewrite_token:
      resume_rewrite_token = rewrite_response.rewriteToken
      tracker_file_util.write_rewrite_tracker_file(
          tracker_file_path, rewrite_parameters_hash,
          rewrite_response.rewriteToken)

  tracker_file_util.delete_tracker_file(tracker_file_path)
  return gcs_metadata_util.get_object_resource_from_metadata(
      rewrite_response.resource)
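
# Hedged sketch of the rewrite-resume protocol the loop above follows: each
# Rewrite call returns a token until done, and persisting the first token
# lets an interrupted copy resume. The fake service below is illustrative
# only; it is not the GCS API.
class FakeRewriteService:
  def __init__(self, total_calls):
    self._calls_left = total_calls

  def rewrite(self, token):
    self._calls_left -= 1
    done = self._calls_left == 0
    return {'done': done, 'rewriteToken': None if done else 'token-123'}

service = FakeRewriteService(total_calls=3)
resume_token = None  # Would be read from the tracker file if present.
while True:
  response = service.rewrite(resume_token)
  if response['done']:
    break
  if not resume_token:
    resume_token = response['rewriteToken']  # Persist for resumption.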
def copy_object(self,
                source_resource,
                destination_resource,
                request_config,
                progress_callback=None):
  """See super class."""
  destination_metadata = getattr(destination_resource, 'metadata', None)
  if not destination_metadata:
    destination_metadata = gcs_metadata_util.get_apitools_metadata_from_url(
        destination_resource.storage_url)
  if source_resource.metadata:
    gcs_metadata_util.copy_select_object_metadata(source_resource.metadata,
                                                  destination_metadata,
                                                  request_config)
  gcs_metadata_util.update_object_metadata_from_request_config(
      destination_metadata, request_config)

  if request_config.max_bytes_per_call:
    max_bytes_per_call = request_config.max_bytes_per_call
  else:
    max_bytes_per_call = scaled_integer.ParseInteger(
        properties.VALUES.storage.copy_chunk_size.Get())

  if request_config.predefined_acl_string:
    predefined_acl = getattr(
        self.messages.StorageObjectsRewriteRequest
        .DestinationPredefinedAclValueValuesEnum,
        request_config.predefined_acl_string)
  else:
    predefined_acl = None

  if source_resource.generation is None:
    source_generation = None
  else:
    source_generation = int(source_resource.generation)

  tracker_file_path = tracker_file_util.get_tracker_file_path(
      destination_resource.storage_url,
      tracker_file_util.TrackerFileType.REWRITE,
      source_url=source_resource.storage_url)
  rewrite_parameters_hash = (
      tracker_file_util.hash_gcs_rewrite_parameters_for_tracker_file(
          source_resource,
          destination_resource,
          destination_metadata,
          request_config=request_config))

  try:
    resume_rewrite_token = tracker_file_util.read_rewrite_tracker_file(
        tracker_file_path, rewrite_parameters_hash)
    log.debug('Found rewrite token. Resuming copy.')
  except files.MissingFileError:
    resume_rewrite_token = None
    log.debug('No rewrite token found. Starting copy from scratch.')

  with self._encryption_headers_for_rewrite_call_context(request_config):
    while True:
      request = self.messages.StorageObjectsRewriteRequest(
          sourceBucket=source_resource.storage_url.bucket_name,
          sourceObject=source_resource.storage_url.object_name,
          destinationBucket=destination_resource.storage_url.bucket_name,
          destinationObject=destination_resource.storage_url.object_name,
          object=destination_metadata,
          sourceGeneration=source_generation,
          ifGenerationMatch=copy_util.get_generation_match_value(
              request_config),
          ifMetagenerationMatch=(
              request_config.precondition_metageneration_match),
          destinationPredefinedAcl=predefined_acl,
          rewriteToken=resume_rewrite_token,
          maxBytesRewrittenPerCall=max_bytes_per_call)

      encryption_key = getattr(request_config.resource_args,
                               'encryption_key', None)
      if (encryption_key and
          encryption_key.type == encryption_util.KeyType.CMEK):
        # This key is also provided in destination_metadata.kmsKeyName by
        # update_object_metadata_from_request_config. This has no effect on
        # the copy object request, which references the field below, and is
        # a side-effect of logic required for uploads and compose operations.
        request.destinationKmsKeyName = encryption_key.key

      rewrite_response = self.client.objects.Rewrite(request)
      processed_bytes = rewrite_response.totalBytesRewritten
      if progress_callback:
        progress_callback(processed_bytes)

      if rewrite_response.done:
        break

      if not resume_rewrite_token:
        resume_rewrite_token = rewrite_response.rewriteToken
        if source_resource.size >= scaled_integer.ParseInteger(
            properties.VALUES.storage.resumable_threshold.Get()):
          tracker_file_util.write_rewrite_tracker_file(
              tracker_file_path, rewrite_parameters_hash,
              rewrite_response.rewriteToken)

  tracker_file_util.delete_tracker_file(tracker_file_path)
  return gcs_metadata_util.get_object_resource_from_metadata(
      rewrite_response.resource)