def testWriteComponentToParallelUploadTrackerFile(self):
  """Verifies appending a component enforces the tracker's encryption key."""
  lock = parallelism_framework_util.CreateLock()
  path = self.CreateTempFile(file_name='foo')
  prefix = '123'
  key_sha = '456'
  existing = [
      ObjectFromTracker('obj1', '42'),
      ObjectFromTracker('obj2', '314159'),
  ]
  WriteParallelUploadTrackerFile(
      path, prefix, existing, encryption_key_sha256=key_sha)
  appended = ObjectFromTracker('obj3', '43')
  # Appending with a non-matching (None) key must raise, leaving the
  # tracker file untouched.
  try:
    WriteComponentToParallelUploadTrackerFile(
        path, lock, appended, self.logger, encryption_key_sha256=None)
    self.fail('Expected CommandException due to different encryption key')
  except CommandException as e:
    self.assertIn('does not match encryption key', str(e))
  # With the matching key, the component is appended successfully.
  WriteComponentToParallelUploadTrackerFile(
      path, lock, appended, self.logger, encryption_key_sha256='456')
  actual_key, actual_prefix, actual_objects = ReadParallelUploadTrackerFile(
      path, self.logger)
  self.assertEqual(key_sha, actual_key)
  self.assertEqual(prefix, actual_prefix)
  self.assertEqual(existing + [appended], actual_objects)
def testFilterExistingComponentsVersioned(self):
  """Tests upload with versioned parallel components."""
  mock_api = MockCloudApi()
  bucket_name = self.MakeTempName('bucket')
  mock_api.MockCreateVersionedBucket(bucket_name)

  # dst_obj_metadata used for passing content-type.
  empty_object = apitools_messages.Object()
  tracker_file = self.CreateTempFile(file_name='foo', contents=b'asdf')
  tracker_file_lock = parallelism_framework_util.CreateLock()

  # Case 1: Already uploaded, contents still match, component still used.
  fpath_uploaded_correctly = self.CreateTempFile(file_name='foo1',
                                                 contents=b'1')
  fpath_uploaded_correctly_url = StorageUrlFromString(
      str(fpath_uploaded_correctly))
  with open(fpath_uploaded_correctly, 'rb') as f_in:
    fpath_uploaded_correctly_md5 = _CalculateB64EncodedMd5FromContents(f_in)
  object_uploaded_correctly = mock_api.MockCreateObjectWithMetadata(
      apitools_messages.Object(bucket=bucket_name,
                               name=fpath_uploaded_correctly,
                               md5Hash=fpath_uploaded_correctly_md5),
      contents=b'1')
  # Versioned bucket, so the component URL carries a #generation suffix.
  object_uploaded_correctly_url = StorageUrlFromString(
      '%s://%s/%s#%s' % (self.default_provider, bucket_name,
                         fpath_uploaded_correctly,
                         object_uploaded_correctly.generation))
  args_uploaded_correctly = PerformParallelUploadFileToObjectArgs(
      fpath_uploaded_correctly, 0, 1, fpath_uploaded_correctly_url,
      object_uploaded_correctly_url, object_uploaded_correctly.generation,
      empty_object, tracker_file, tracker_file_lock, None, False)

  # Case 2: Duplicate object name in tracker file, but uploaded correctly.
  # Same local file as case 1, but a second (newer) generation in the cloud.
  fpath_duplicate = fpath_uploaded_correctly
  fpath_duplicate_url = StorageUrlFromString(str(fpath_duplicate))
  duplicate_uploaded_correctly = mock_api.MockCreateObjectWithMetadata(
      apitools_messages.Object(bucket=bucket_name,
                               name=fpath_duplicate,
                               md5Hash=fpath_uploaded_correctly_md5),
      contents=b'1')
  duplicate_uploaded_correctly_url = StorageUrlFromString(
      '%s://%s/%s#%s' % (self.default_provider, bucket_name,
                         fpath_uploaded_correctly,
                         duplicate_uploaded_correctly.generation))
  args_duplicate = PerformParallelUploadFileToObjectArgs(
      fpath_duplicate, 0, 1, fpath_duplicate_url,
      duplicate_uploaded_correctly_url,
      duplicate_uploaded_correctly.generation, empty_object, tracker_file,
      tracker_file_lock, None, False)

  # Case 3: Already uploaded, but contents no longer match.
  # The cloud object's md5 is computed from different contents (b'_') so it
  # will not match the local file (b'4').
  fpath_wrong_contents = self.CreateTempFile(file_name='foo4', contents=b'4')
  fpath_wrong_contents_url = StorageUrlFromString(str(fpath_wrong_contents))
  with open(self.CreateTempFile(contents=b'_'), 'rb') as f_in:
    fpath_wrong_contents_md5 = _CalculateB64EncodedMd5FromContents(f_in)
  object_wrong_contents = mock_api.MockCreateObjectWithMetadata(
      apitools_messages.Object(bucket=bucket_name,
                               name=fpath_wrong_contents,
                               md5Hash=fpath_wrong_contents_md5),
      contents=b'_')
  wrong_contents_url = StorageUrlFromString(
      '%s://%s/%s#%s' % (self.default_provider, bucket_name,
                         fpath_wrong_contents,
                         object_wrong_contents.generation))
  args_wrong_contents = PerformParallelUploadFileToObjectArgs(
      fpath_wrong_contents, 0, 1, fpath_wrong_contents_url,
      wrong_contents_url, '', empty_object, tracker_file, tracker_file_lock,
      None, False)

  dst_args = {
      fpath_uploaded_correctly: args_uploaded_correctly,
      fpath_wrong_contents: args_wrong_contents
  }
  existing_components = [
      ObjectFromTracker(fpath_uploaded_correctly,
                        object_uploaded_correctly_url.generation),
      ObjectFromTracker(fpath_duplicate,
                        duplicate_uploaded_correctly_url.generation),
      ObjectFromTracker(fpath_wrong_contents, wrong_contents_url.generation)
  ]
  bucket_url = StorageUrlFromString('%s://%s' % (self.default_provider,
                                                 bucket_name))

  (components_to_upload, uploaded_components,
   existing_objects_to_delete) = (FilterExistingComponents(
       dst_args, existing_components, bucket_url, mock_api))
  # uploaded_components entries are (url, ...) tuples; keep only the URLs.
  uploaded_components = [i[0] for i in uploaded_components]

  # Only the mismatched component needs re-upload.
  self.assertEqual([args_wrong_contents], components_to_upload)
  # The matching component is reused.
  self.assertEqual(args_uploaded_correctly.dst_url.url_string,
                   uploaded_components[0].url_string)
  # In a versioned bucket, both the stale generation and the duplicate
  # generation must be scheduled for deletion.
  expected_to_delete = [(args_wrong_contents.dst_url.object_name,
                         args_wrong_contents.dst_url.generation),
                        (args_duplicate.dst_url.object_name,
                         args_duplicate.dst_url.generation)]
  for uri in existing_objects_to_delete:
    self.assertTrue((uri.object_name, uri.generation) in expected_to_delete)
  self.assertEqual(len(expected_to_delete), len(existing_objects_to_delete))
def testFilterExistingComponentsNonVersioned(self):
  """Tests upload with a variety of component states."""
  mock_api = MockCloudApi()
  bucket_name = self.MakeTempName('bucket')
  tracker_file = self.CreateTempFile(file_name='foo', contents=b'asdf')
  tracker_file_lock = parallelism_framework_util.CreateLock()

  # dst_obj_metadata used for passing content-type.
  empty_object = apitools_messages.Object()

  # Case 1: Already uploaded, contents still match, component still used.
  fpath_uploaded_correctly = self.CreateTempFile(file_name='foo1',
                                                 contents=b'1')
  fpath_uploaded_correctly_url = StorageUrlFromString(
      str(fpath_uploaded_correctly))
  object_uploaded_correctly_url = StorageUrlFromString(
      '%s://%s/%s' % (self.default_provider, bucket_name,
                      fpath_uploaded_correctly))
  with open(fpath_uploaded_correctly, 'rb') as f_in:
    fpath_uploaded_correctly_md5 = _CalculateB64EncodedMd5FromContents(f_in)
  mock_api.MockCreateObjectWithMetadata(apitools_messages.Object(
      bucket=bucket_name,
      name=fpath_uploaded_correctly,
      md5Hash=fpath_uploaded_correctly_md5), contents=b'1')
  args_uploaded_correctly = PerformParallelUploadFileToObjectArgs(
      fpath_uploaded_correctly, 0, 1, fpath_uploaded_correctly_url,
      object_uploaded_correctly_url, '', empty_object, tracker_file,
      tracker_file_lock, None, False)

  # Case 2: Not yet uploaded, but needed.
  fpath_not_uploaded = self.CreateTempFile(file_name='foo2', contents=b'2')
  fpath_not_uploaded_url = StorageUrlFromString(str(fpath_not_uploaded))
  object_not_uploaded_url = StorageUrlFromString(
      '%s://%s/%s' % (self.default_provider, bucket_name,
                      fpath_not_uploaded))
  args_not_uploaded = PerformParallelUploadFileToObjectArgs(
      fpath_not_uploaded, 0, 1, fpath_not_uploaded_url,
      object_not_uploaded_url, '', empty_object, tracker_file,
      tracker_file_lock, None, False)

  # Case 3: Already uploaded, but contents no longer match. Even though the
  # contents differ, we don't delete this since the bucket is not versioned
  # and it will be overwritten anyway.
  fpath_wrong_contents = self.CreateTempFile(file_name='foo4', contents=b'4')
  fpath_wrong_contents_url = StorageUrlFromString(str(fpath_wrong_contents))
  object_wrong_contents_url = StorageUrlFromString(
      '%s://%s/%s' % (self.default_provider, bucket_name,
                      fpath_wrong_contents))
  with open(self.CreateTempFile(contents=b'_'), 'rb') as f_in:
    fpath_wrong_contents_md5 = _CalculateB64EncodedMd5FromContents(f_in)
  mock_api.MockCreateObjectWithMetadata(apitools_messages.Object(
      bucket=bucket_name,
      name=fpath_wrong_contents,
      md5Hash=fpath_wrong_contents_md5), contents=b'1')
  args_wrong_contents = PerformParallelUploadFileToObjectArgs(
      fpath_wrong_contents, 0, 1, fpath_wrong_contents_url,
      object_wrong_contents_url, '', empty_object, tracker_file,
      tracker_file_lock, None, False)

  # Case 4: Exists in tracker file, but component object no longer exists.
  # Note: no mock cloud object is created for this one.
  fpath_remote_deleted = self.CreateTempFile(file_name='foo5', contents=b'5')
  fpath_remote_deleted_url = StorageUrlFromString(str(fpath_remote_deleted))
  args_remote_deleted = PerformParallelUploadFileToObjectArgs(
      fpath_remote_deleted, 0, 1, fpath_remote_deleted_url, '', '',
      empty_object, tracker_file, tracker_file_lock, None, False)

  # Case 5: Exists in tracker file and already uploaded, but no longer
  # needed (it is absent from dst_args below).
  fpath_no_longer_used = self.CreateTempFile(file_name='foo6', contents=b'6')
  with open(fpath_no_longer_used, 'rb') as f_in:
    file_md5 = _CalculateB64EncodedMd5FromContents(f_in)
  mock_api.MockCreateObjectWithMetadata(apitools_messages.Object(
      bucket=bucket_name, name='foo6', md5Hash=file_md5), contents=b'6')

  dst_args = {
      fpath_uploaded_correctly: args_uploaded_correctly,
      fpath_not_uploaded: args_not_uploaded,
      fpath_wrong_contents: args_wrong_contents,
      fpath_remote_deleted: args_remote_deleted
  }

  existing_components = [
      ObjectFromTracker(fpath_uploaded_correctly, ''),
      ObjectFromTracker(fpath_wrong_contents, ''),
      ObjectFromTracker(fpath_remote_deleted, ''),
      ObjectFromTracker(fpath_no_longer_used, '')
  ]

  bucket_url = StorageUrlFromString('%s://%s' % (self.default_provider,
                                                 bucket_name))

  (components_to_upload, uploaded_components,
   existing_objects_to_delete) = (FilterExistingComponents(
       dst_args, existing_components, bucket_url, mock_api))
  # uploaded_components entries are (url, ...) tuples; keep only the URLs.
  uploaded_components = [i[0] for i in uploaded_components]

  # Cases 2, 3, and 4 all require (re-)upload.
  for arg in [args_not_uploaded, args_wrong_contents, args_remote_deleted]:
    self.assertTrue(arg in components_to_upload)
  # Only case 1 is reused as-is.
  self.assertEqual(1, len(uploaded_components))
  self.assertEqual(args_uploaded_correctly.dst_url.url_string,
                   uploaded_components[0].url_string)
  # Only case 5 (no longer needed) is deleted.
  self.assertEqual(1, len(existing_objects_to_delete))
  no_longer_used_url = StorageUrlFromString(
      '%s://%s/%s' % (self.default_provider, bucket_name,
                      fpath_no_longer_used))
  self.assertEqual(no_longer_used_url.url_string,
                   existing_objects_to_delete[0].url_string)
def __init__(self,
             src_url,
             src_obj_size,
             gsutil_api,
             compressed_encoding=False,
             progress_callback=None,
             download_chunk_size=_DEFAULT_DOWNLOAD_CHUNK_SIZE,
             decryption_key=None):
  """Initializes the daisy chain wrapper.

  Args:
    src_url: Source CloudUrl to copy from.
    src_obj_size: Size of source object.
    gsutil_api: gsutil Cloud API to use for the copy.
    compressed_encoding: If true, source object has content-encoding: gzip.
    progress_callback: Optional callback function for progress
        notifications for the download thread. Receives calls with
        arguments (bytes_transferred, total_size).
    download_chunk_size: Integer number of bytes to download per
        GetObjectMedia request. This is the upper bound of bytes that may
        be unnecessarily downloaded if there is a break in the resumable
        upload.
    decryption_key: Base64-encoded decryption key for the source object,
        if any.

  Raises:
    Exception: if the download thread doesn't start within 60 seconds
  """
  # Current read position for the upload file pointer.
  self.position = 0
  # Downloaded-but-not-yet-uploaded chunks, consumed FIFO by the uploader.
  self.buffer = collections.deque()
  self.bytes_buffered = 0
  # Maximum amount of bytes in memory at a time.
  self.max_buffer_size = 1024 * 1024  # 1 MiB

  self._download_chunk_size = download_chunk_size

  # We save one buffer's worth of data as a special case for boto,
  # which seeks back one buffer and rereads to compute hashes. This is
  # unnecessary because we can just compare cloud hash digests at the end,
  # but it allows this to work without modifying boto.
  self.last_position = 0
  self.last_data = None

  # Protects buffer, position, bytes_buffered, last_position, and last_data.
  self.lock = parallelism_framework_util.CreateLock()

  # Protects download_exception.
  self.download_exception_lock = parallelism_framework_util.CreateLock()

  self.src_obj_size = src_obj_size
  self.src_url = src_url
  self.compressed_encoding = compressed_encoding
  self.decryption_tuple = CryptoKeyWrapperFromKey(decryption_key)

  # It is safe to share gsutil_api between the upload and download threads
  # because the download thread calls only GetObjectMedia, which creates a
  # new HTTP connection independent of gsutil_api. Thus, it will not share
  # an HTTP connection with the upload.
  self.gsutil_api = gsutil_api

  # If self.download_thread dies due to an exception, it is saved here so
  # that it can also be raised in the upload thread.
  self.download_exception = None
  self.download_thread = None
  self.progress_callback = progress_callback
  # Events must exist before the download thread starts: the thread signals
  # download_started and polls stop_download.
  self.download_started = threading.Event()
  self.stop_download = threading.Event()
  self.StartDownloadThread(progress_callback=self.progress_callback)
  # Event.wait(60) returns False on timeout rather than raising, so check
  # the return value explicitly.
  if not self.download_started.wait(60):
    raise Exception(
        'Could not start download thread after 60 seconds.')