Exemple #1
0
  def testWriteComponentToParallelUploadTrackerFile(self):
    """Tests appending a component with matching and mismatched CSEK hashes.

    Writes a tracker file with an encryption key hash, then verifies that
    appending a component with a different (None) key raises
    CommandException, while appending with the matching key succeeds and the
    re-read tracker file contains the original objects plus the new one.
    """
    tracker_file_lock = parallelism_framework_util.CreateLock()
    fpath = self.CreateTempFile(file_name='foo')
    random_prefix = '123'
    enc_key = '456'
    objects = [ObjectFromTracker('obj1', '42'),
               ObjectFromTracker('obj2', '314159')]
    WriteParallelUploadTrackerFile(fpath, random_prefix, objects,
                                   encryption_key_sha256=enc_key)
    new_object = ObjectFromTracker('obj3', '43')

    # A key hash that differs from the one in the tracker file (here: None
    # vs. enc_key) must be rejected.
    with self.assertRaises(CommandException) as cm:
      WriteComponentToParallelUploadTrackerFile(
          fpath, tracker_file_lock, new_object, self.logger,
          encryption_key_sha256=None)
    self.assertIn('does not match encryption key', str(cm.exception))

    # The matching key hash appends the component successfully.
    WriteComponentToParallelUploadTrackerFile(
        fpath, tracker_file_lock, new_object, self.logger,
        encryption_key_sha256=enc_key)

    (actual_key, actual_prefix, actual_objects) = ReadParallelUploadTrackerFile(
        fpath, self.logger)
    self.assertEqual(enc_key, actual_key)
    self.assertEqual(random_prefix, actual_prefix)
    self.assertEqual(objects + [new_object], actual_objects)
  def testFilterExistingComponentsVersioned(self):
    """Tests upload with versioned parallel components.

    Builds three components recorded in a tracker file against a versioned
    mock bucket and verifies FilterExistingComponents partitions them:
      - a component uploaded correctly (reused, not re-uploaded);
      - a duplicate tracker entry for the same object name (its extra
        generation is scheduled for deletion);
      - a component whose local contents changed (re-uploaded; the stale
        generation is scheduled for deletion).
    NOTE(review): object creation order matters here — generations are
    assigned by the mock API as objects are created.
    """

    mock_api = MockCloudApi()
    bucket_name = self.MakeTempName('bucket')
    mock_api.MockCreateVersionedBucket(bucket_name)

    # dst_obj_metadata used for passing content-type.
    empty_object = apitools_messages.Object()

    tracker_file = self.CreateTempFile(file_name='foo', contents=b'asdf')
    tracker_file_lock = parallelism_framework_util.CreateLock()

    # Already uploaded, contents still match, component still used.
    fpath_uploaded_correctly = self.CreateTempFile(file_name='foo1',
                                                   contents=b'1')
    fpath_uploaded_correctly_url = StorageUrlFromString(
        str(fpath_uploaded_correctly))
    with open(fpath_uploaded_correctly, 'rb') as f_in:
      fpath_uploaded_correctly_md5 = _CalculateB64EncodedMd5FromContents(f_in)
    object_uploaded_correctly = mock_api.MockCreateObjectWithMetadata(
        apitools_messages.Object(bucket=bucket_name,
                                 name=fpath_uploaded_correctly,
                                 md5Hash=fpath_uploaded_correctly_md5),
        contents=b'1')
    # Versioned URL includes the generation after '#'.
    object_uploaded_correctly_url = StorageUrlFromString(
        '%s://%s/%s#%s' %
        (self.default_provider, bucket_name, fpath_uploaded_correctly,
         object_uploaded_correctly.generation))
    args_uploaded_correctly = PerformParallelUploadFileToObjectArgs(
        fpath_uploaded_correctly, 0, 1, fpath_uploaded_correctly_url,
        object_uploaded_correctly_url, object_uploaded_correctly.generation,
        empty_object, tracker_file, tracker_file_lock, None, False)

    # Duplicate object name in tracker file, but uploaded correctly.
    fpath_duplicate = fpath_uploaded_correctly
    fpath_duplicate_url = StorageUrlFromString(str(fpath_duplicate))
    duplicate_uploaded_correctly = mock_api.MockCreateObjectWithMetadata(
        apitools_messages.Object(bucket=bucket_name,
                                 name=fpath_duplicate,
                                 md5Hash=fpath_uploaded_correctly_md5),
        contents=b'1')
    # Same object name but a distinct (newer) generation than above.
    duplicate_uploaded_correctly_url = StorageUrlFromString(
        '%s://%s/%s#%s' %
        (self.default_provider, bucket_name, fpath_uploaded_correctly,
         duplicate_uploaded_correctly.generation))
    args_duplicate = PerformParallelUploadFileToObjectArgs(
        fpath_duplicate, 0, 1, fpath_duplicate_url,
        duplicate_uploaded_correctly_url,
        duplicate_uploaded_correctly.generation, empty_object, tracker_file,
        tracker_file_lock, None, False)

    # Already uploaded, but contents no longer match.
    fpath_wrong_contents = self.CreateTempFile(file_name='foo4', contents=b'4')
    fpath_wrong_contents_url = StorageUrlFromString(str(fpath_wrong_contents))
    # MD5 is computed from different contents (b'_') than the local file
    # (b'4'), so the cloud component will not match the local file.
    with open(self.CreateTempFile(contents=b'_'), 'rb') as f_in:
      fpath_wrong_contents_md5 = _CalculateB64EncodedMd5FromContents(f_in)
    object_wrong_contents = mock_api.MockCreateObjectWithMetadata(
        apitools_messages.Object(bucket=bucket_name,
                                 name=fpath_wrong_contents,
                                 md5Hash=fpath_wrong_contents_md5),
        contents=b'_')
    wrong_contents_url = StorageUrlFromString(
        '%s://%s/%s#%s' %
        (self.default_provider, bucket_name, fpath_wrong_contents,
         object_wrong_contents.generation))
    args_wrong_contents = PerformParallelUploadFileToObjectArgs(
        fpath_wrong_contents, 0, 1, fpath_wrong_contents_url,
        wrong_contents_url, '', empty_object, tracker_file, tracker_file_lock,
        None, False)

    dst_args = {
        fpath_uploaded_correctly: args_uploaded_correctly,
        fpath_wrong_contents: args_wrong_contents
    }

    existing_components = [
        ObjectFromTracker(fpath_uploaded_correctly,
                          object_uploaded_correctly_url.generation),
        ObjectFromTracker(fpath_duplicate,
                          duplicate_uploaded_correctly_url.generation),
        ObjectFromTracker(fpath_wrong_contents, wrong_contents_url.generation)
    ]

    bucket_url = StorageUrlFromString('%s://%s' %
                                      (self.default_provider, bucket_name))

    (components_to_upload, uploaded_components,
     existing_objects_to_delete) = (FilterExistingComponents(
         dst_args, existing_components, bucket_url, mock_api))
    uploaded_components = [i[0] for i in uploaded_components]
    # Only the changed-contents component must be (re-)uploaded.
    self.assertEqual([args_wrong_contents], components_to_upload)
    # The correctly-uploaded component is reused as-is.
    self.assertEqual(args_uploaded_correctly.dst_url.url_string,
                     uploaded_components[0].url_string)
    # The stale generation of the changed component and the duplicate
    # tracker entry's generation are both scheduled for deletion.
    expected_to_delete = [(args_wrong_contents.dst_url.object_name,
                           args_wrong_contents.dst_url.generation),
                          (args_duplicate.dst_url.object_name,
                           args_duplicate.dst_url.generation)]
    for uri in existing_objects_to_delete:
      self.assertTrue((uri.object_name, uri.generation) in expected_to_delete)
    self.assertEqual(len(expected_to_delete), len(existing_objects_to_delete))
  def testFilterExistingComponentsNonVersioned(self):
    """Tests upload with a variety of component states.

    Exercises FilterExistingComponents against a non-versioned mock bucket
    with five component states: uploaded correctly (reused), not yet
    uploaded (uploaded), contents changed (re-uploaded, old object NOT
    deleted since an overwrite replaces it), remotely deleted (re-uploaded),
    and no longer needed (deleted).
    """
    mock_api = MockCloudApi()
    bucket_name = self.MakeTempName('bucket')
    tracker_file = self.CreateTempFile(file_name='foo', contents=b'asdf')
    tracker_file_lock = parallelism_framework_util.CreateLock()

    # dst_obj_metadata used for passing content-type.
    empty_object = apitools_messages.Object()

    # Already uploaded, contents still match, component still used.
    fpath_uploaded_correctly = self.CreateTempFile(file_name='foo1',
                                                   contents=b'1')
    fpath_uploaded_correctly_url = StorageUrlFromString(
        str(fpath_uploaded_correctly))
    object_uploaded_correctly_url = StorageUrlFromString(
        '%s://%s/%s' %
        (self.default_provider, bucket_name, fpath_uploaded_correctly))
    with open(fpath_uploaded_correctly, 'rb') as f_in:
      fpath_uploaded_correctly_md5 = _CalculateB64EncodedMd5FromContents(f_in)
    mock_api.MockCreateObjectWithMetadata(apitools_messages.Object(
        bucket=bucket_name,
        name=fpath_uploaded_correctly,
        md5Hash=fpath_uploaded_correctly_md5),
                                          contents=b'1')

    args_uploaded_correctly = PerformParallelUploadFileToObjectArgs(
        fpath_uploaded_correctly, 0, 1, fpath_uploaded_correctly_url,
        object_uploaded_correctly_url, '', empty_object, tracker_file,
        tracker_file_lock, None, False)

    # Not yet uploaded, but needed.
    fpath_not_uploaded = self.CreateTempFile(file_name='foo2', contents=b'2')
    fpath_not_uploaded_url = StorageUrlFromString(str(fpath_not_uploaded))
    object_not_uploaded_url = StorageUrlFromString(
        '%s://%s/%s' % (self.default_provider, bucket_name, fpath_not_uploaded))
    args_not_uploaded = PerformParallelUploadFileToObjectArgs(
        fpath_not_uploaded, 0, 1, fpath_not_uploaded_url,
        object_not_uploaded_url, '', empty_object, tracker_file,
        tracker_file_lock, None, False)

    # Already uploaded, but contents no longer match. Even though the contents
    # differ, we don't delete this since the bucket is not versioned and it
    # will be overwritten anyway.
    fpath_wrong_contents = self.CreateTempFile(file_name='foo4', contents=b'4')
    fpath_wrong_contents_url = StorageUrlFromString(str(fpath_wrong_contents))
    object_wrong_contents_url = StorageUrlFromString(
        '%s://%s/%s' %
        (self.default_provider, bucket_name, fpath_wrong_contents))
    # MD5 taken from a file with different contents (b'_') than the local
    # source file (b'4'), so the existing cloud component won't match.
    with open(self.CreateTempFile(contents=b'_'), 'rb') as f_in:
      fpath_wrong_contents_md5 = _CalculateB64EncodedMd5FromContents(f_in)
    mock_api.MockCreateObjectWithMetadata(apitools_messages.Object(
        bucket=bucket_name,
        name=fpath_wrong_contents,
        md5Hash=fpath_wrong_contents_md5),
                                          contents=b'1')

    args_wrong_contents = PerformParallelUploadFileToObjectArgs(
        fpath_wrong_contents, 0, 1, fpath_wrong_contents_url,
        object_wrong_contents_url, '', empty_object, tracker_file,
        tracker_file_lock, None, False)

    # Exists in tracker file, but component object no longer exists.
    # Note: no mock object is created for this one, simulating deletion.
    fpath_remote_deleted = self.CreateTempFile(file_name='foo5', contents=b'5')
    fpath_remote_deleted_url = StorageUrlFromString(str(fpath_remote_deleted))
    args_remote_deleted = PerformParallelUploadFileToObjectArgs(
        fpath_remote_deleted, 0, 1, fpath_remote_deleted_url, '', '',
        empty_object, tracker_file, tracker_file_lock, None, False)

    # Exists in tracker file and already uploaded, but no longer needed.
    # Note: intentionally absent from dst_args below.
    fpath_no_longer_used = self.CreateTempFile(file_name='foo6', contents=b'6')
    with open(fpath_no_longer_used, 'rb') as f_in:
      file_md5 = _CalculateB64EncodedMd5FromContents(f_in)
    mock_api.MockCreateObjectWithMetadata(apitools_messages.Object(
        bucket=bucket_name, name='foo6', md5Hash=file_md5),
                                          contents=b'6')

    dst_args = {
        fpath_uploaded_correctly: args_uploaded_correctly,
        fpath_not_uploaded: args_not_uploaded,
        fpath_wrong_contents: args_wrong_contents,
        fpath_remote_deleted: args_remote_deleted
    }

    existing_components = [
        ObjectFromTracker(fpath_uploaded_correctly, ''),
        ObjectFromTracker(fpath_wrong_contents, ''),
        ObjectFromTracker(fpath_remote_deleted, ''),
        ObjectFromTracker(fpath_no_longer_used, '')
    ]

    bucket_url = StorageUrlFromString('%s://%s' %
                                      (self.default_provider, bucket_name))

    (components_to_upload, uploaded_components,
     existing_objects_to_delete) = (FilterExistingComponents(
         dst_args, existing_components, bucket_url, mock_api))
    uploaded_components = [i[0] for i in uploaded_components]
    # Everything except the correctly-uploaded component needs (re-)upload.
    for arg in [args_not_uploaded, args_wrong_contents, args_remote_deleted]:
      self.assertTrue(arg in components_to_upload)
    # Only the correctly-uploaded component is reused.
    self.assertEqual(1, len(uploaded_components))
    self.assertEqual(args_uploaded_correctly.dst_url.url_string,
                     uploaded_components[0].url_string)
    # Only the no-longer-needed component is deleted.
    self.assertEqual(1, len(existing_objects_to_delete))
    no_longer_used_url = StorageUrlFromString(
        '%s://%s/%s' %
        (self.default_provider, bucket_name, fpath_no_longer_used))
    self.assertEqual(no_longer_used_url.url_string,
                     existing_objects_to_delete[0].url_string)
Exemple #4
0
    def __init__(self,
                 src_url,
                 src_obj_size,
                 gsutil_api,
                 compressed_encoding=False,
                 progress_callback=None,
                 download_chunk_size=_DEFAULT_DOWNLOAD_CHUNK_SIZE,
                 decryption_key=None):
        """Initializes the daisy chain wrapper.

        Args:
          src_url: Source CloudUrl to copy from.
          src_obj_size: Size of source object.
          gsutil_api: gsutil Cloud API to use for the copy.
          compressed_encoding: If true, source object has content-encoding:
              gzip.
          progress_callback: Optional callback function for progress
              notifications for the download thread. Receives calls with
              arguments (bytes_transferred, total_size).
          download_chunk_size: Integer number of bytes to download per
              GetObjectMedia request. This is the upper bound of bytes that
              may be unnecessarily downloaded if there is a break in the
              resumable upload.
          decryption_key: Base64-encoded decryption key for the source
              object, if any.

        Raises:
          Exception: if the download thread doesn't start within 60 seconds.
        """
        # Current read position for the upload file pointer.
        self.position = 0
        # FIFO of downloaded data chunks awaiting consumption by the upload.
        self.buffer = collections.deque()

        self.bytes_buffered = 0
        # Maximum amount of bytes in memory at a time.
        self.max_buffer_size = 1024 * 1024  # 1 MiB

        self._download_chunk_size = download_chunk_size

        # We save one buffer's worth of data as a special case for boto,
        # which seeks back one buffer and rereads to compute hashes. This is
        # unnecessary because we can just compare cloud hash digests at the end,
        # but it allows this to work without modifying boto.
        self.last_position = 0
        self.last_data = None

        # Protects buffer, position, bytes_buffered, last_position, and last_data.
        self.lock = parallelism_framework_util.CreateLock()

        # Protects download_exception.
        self.download_exception_lock = parallelism_framework_util.CreateLock()

        self.src_obj_size = src_obj_size
        self.src_url = src_url
        self.compressed_encoding = compressed_encoding
        self.decryption_tuple = CryptoKeyWrapperFromKey(decryption_key)

        # It is safe to share gsutil_api between the upload and download
        # threads because the download thread calls only GetObjectMedia, which
        # creates a new HTTP connection independent of gsutil_api. Thus, it
        # will not share an HTTP connection with the upload.
        self.gsutil_api = gsutil_api

        # If self.download_thread dies due to an exception, it is saved here so
        # that it can also be raised in the upload thread.
        self.download_exception = None
        self.download_thread = None
        self.progress_callback = progress_callback
        # Set once the download thread has started; waited on below.
        self.download_started = threading.Event()
        # NOTE(review): presumably polled by the download thread to stop
        # early — confirm against StartDownloadThread's implementation.
        self.stop_download = threading.Event()
        self.StartDownloadThread(progress_callback=self.progress_callback)
        if not self.download_started.wait(60):
            raise Exception(
                'Could not start download thread after 60 seconds.')