Example #1
class StorageBase(ReprMixIn, metaclass=ABCMeta):

    _CHECKSUM_KEY = 'checksum'
    _CREATED_KEY = 'created'
    _MODIFIED_KEY = 'modified'
    _HMAC_KEY = 'hmac'
    _METADATA_VERSION_KEY = 'metadata_version'
    _OBJECT_SIZE_KEY = 'object_size'
    _SIZE_KEY = 'size'
    _TRANSFORMS_KEY = 'transforms'

    _META_SUFFIX = '.meta'

    def __init__(self, *, config: Config, name: str, storage_id: int, module_configuration: ConfigDict) -> None:
        self._name = name
        self._storage_id = storage_id
        self._active_transforms: List[TransformBase] = []

        active_transforms = Config.get_from_dict(module_configuration, 'activeTransforms', None, types=list)
        if active_transforms is not None:
            for transform in active_transforms:
                self._active_transforms.append(TransformFactory.get_by_name(transform))
            logger.info('Active transforms for storage {}: {}.'.format(
                name, ', '.join(
                    ['{} ({})'.format(transform.name, transform.module) for transform in self._active_transforms])))

        simultaneous_writes = Config.get_from_dict(module_configuration, 'simultaneousWrites', types=int)
        simultaneous_reads = Config.get_from_dict(module_configuration, 'simultaneousReads', types=int)
        simultaneous_removals = Config.get_from_dict(module_configuration, 'simultaneousRemovals', types=int)
        bandwidth_read = Config.get_from_dict(module_configuration, 'bandwidthRead', types=int)
        bandwidth_write = Config.get_from_dict(module_configuration, 'bandwidthWrite', types=int)

        self._consistency_check_writes = Config.get_from_dict(
            module_configuration, 'consistencyCheckWrites', False, types=bool)

        hmac_key_encoded = Config.get_from_dict(module_configuration, 'hmac.key', None, types=str)
        hmac_key: Optional[bytes] = None
        if hmac_key_encoded is None:
            hmac_password = Config.get_from_dict(module_configuration, 'hmac.password', None, types=str)
            if hmac_password is not None:
                hmac_kdf_salt = base64.b64decode(Config.get_from_dict(module_configuration, 'hmac.kdfSalt', types=str))
                hmac_kdf_iterations = Config.get_from_dict(module_configuration, 'hmac.kdfIterations', types=int)
                hmac_key = derive_key(
                    salt=hmac_kdf_salt, iterations=hmac_kdf_iterations, key_length=32, password=hmac_password)
        else:
            hmac_key = base64.b64decode(hmac_key_encoded)
        self._dict_hmac: Optional[DictHMAC] = None
        if hmac_key is not None:
            logger.info('Enabling HMAC object metadata integrity protection for storage {}.'.format(name))
            self._dict_hmac = DictHMAC(hmac_key=self._HMAC_KEY, secret_key=hmac_key)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self._read_executor = JobExecutor(name='Storage-Read', workers=simultaneous_reads, blocking_submit=False)
        self._write_executor = JobExecutor(name='Storage-Write', workers=simultaneous_writes, blocking_submit=True)
        self._remove_executor = JobExecutor(name='Storage-Remove', workers=simultaneous_removals, blocking_submit=True)

    @property
    def name(self) -> str:
        return self._name

    @property
    def storage_id(self) -> int:
        return self._storage_id

    def _build_metadata(self,
                        *,
                        size: int,
                        object_size: int,
                        transforms_metadata: List[Dict] = None,
                        checksum: str = None) -> Tuple[Dict, bytes]:

        timestamp = datetime.datetime.utcnow().isoformat(timespec='microseconds')
        metadata: Dict = {
            self._CREATED_KEY: timestamp,
            self._METADATA_VERSION_KEY: str(VERSIONS.object_metadata.current),
            self._MODIFIED_KEY: timestamp,
            self._OBJECT_SIZE_KEY: object_size,
            self._SIZE_KEY: size,
        }

        if checksum:
            metadata[self._CHECKSUM_KEY] = checksum

        if transforms_metadata:
            metadata[self._TRANSFORMS_KEY] = transforms_metadata

        if self._dict_hmac:
            self._dict_hmac.add_digest(metadata)

        return metadata, json.dumps(metadata, separators=(',', ':')).encode('utf-8')

    def _decode_metadata(self, *, metadata_json: bytes, key: str, data_length: int) -> Dict:
        metadata = json.loads(metadata_json.decode('utf-8'))

        if self._dict_hmac:
            self._dict_hmac.verify_digest(metadata)

        # We currently support only one object metadata version
        if self._METADATA_VERSION_KEY not in metadata:
            raise KeyError('Required object metadata key {} is missing for object {}.'.format(
                self._METADATA_VERSION_KEY, key))
        version_obj = semantic_version.Version(metadata[self._METADATA_VERSION_KEY])
        if version_obj not in VERSIONS.object_metadata.supported:
            raise ValueError('Unsupported object metadata version: "{}".'.format(str(version_obj)))

        for required_key in [self._CREATED_KEY, self._MODIFIED_KEY, self._OBJECT_SIZE_KEY, self._SIZE_KEY]:
            if required_key not in metadata:
                raise KeyError('Required object metadata key {} is missing for object {}.'.format(required_key, key))

        if data_length != metadata[self._OBJECT_SIZE_KEY]:
            raise ValueError('Length mismatch for object {}. Expected: {}, got: {}.'.format(
                key, metadata[self._OBJECT_SIZE_KEY], data_length))

        return metadata

    def _check_write(self, *, key: str, metadata_key: str, data_expected: bytes) -> None:
        data_actual = self._read_object(key)
        metadata_actual_json = self._read_object(metadata_key)

        # Return value is ignored
        self._decode_metadata(metadata_json=metadata_actual_json, key=key, data_length=len(data_actual))

        # Comparing encapsulated data here
        if data_expected != data_actual:
            raise ValueError('Written and read data of {} differ.'.format(key))

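    # Write path: encapsulate the data through the active transforms, build the metadata
    # document, then store the data object and its companion '.meta' object. If either
    # write fails, both objects are removed so that no half-written pair is left behind.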
    def _write(self, block: DereferencedBlock, data: bytes) -> DereferencedBlock:
        data, transforms_metadata = self._encapsulate(data)

        metadata, metadata_json = self._build_metadata(
            size=block.size, object_size=len(data), checksum=block.checksum, transforms_metadata=transforms_metadata)

        key = block.uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX

        time.sleep(self.write_throttling.consume(len(data) + len(metadata_json)))
        t1 = time.time()
        try:
            self._write_object(key, data)
            self._write_object(metadata_key, metadata_json)
        except:
            try:
                self._rm_object(key)
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
            raise
        t2 = time.time()

        logger.debug('{} wrote data of uid {} in {:.2f}s'.format(threading.current_thread().name, block.uid, t2 - t1))

        if self._consistency_check_writes:
            try:
                self._check_write(key=key, metadata_key=metadata_key, data_expected=data)
            except (KeyError, ValueError) as exception:
                raise InvalidBlockException('Check write of block {} (UID {}) failed.'.format(block.id, block.uid),
                                            block) from exception

        return block

    def write_block_async(self, block: Union[DereferencedBlock, Block], data: bytes) -> None:
        block_deref = block.deref() if isinstance(block, Block) else block

        def job():
            return self._write(block_deref, data)

        self._write_executor.submit(job)

    def write_block(self, block: Union[DereferencedBlock, Block], data: bytes) -> None:
        block_deref = block.deref() if isinstance(block, Block) else block
        self._write(block_deref, data)

    def write_get_completed(self, timeout: int = None) -> Iterator[Union[DereferencedBlock, BaseException]]:
        return self._write_executor.get_completed(timeout=timeout)

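    # Read path: fetch the data object (or only its length for metadata-only reads) and
    # the '.meta' object, validate the metadata (HMAC, metadata version, required keys,
    # object size), then reverse the recorded transforms to recover the original data.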
    def _read(self, block: DereferencedBlock, metadata_only: bool) -> Tuple[DereferencedBlock, Optional[bytes], Dict]:
        key = block.uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX
        data: Optional[bytes] = None
        try:
            t1 = time.time()
            if not metadata_only:
                data = self._read_object(key)
                data_length = len(data)
            else:
                data_length = self._read_object_length(key)
            metadata_json = self._read_object(metadata_key)
            time.sleep(self.read_throttling.consume((len(data) if data else 0) + len(metadata_json)))
            t2 = time.time()
        except FileNotFoundError as exception:
            raise InvalidBlockException(
                'Object metadata or data of block {} (UID {}) not found.'.format(block.id, block.uid),
                block) from exception

        try:
            metadata = self._decode_metadata(metadata_json=metadata_json, key=key, data_length=data_length)
        except (KeyError, ValueError) as exception:
            raise InvalidBlockException('Object metadata of block {} (UID {}) is invalid.'.format(block.id, block.uid),
                                        block) from exception

        if self._CHECKSUM_KEY not in metadata:
            raise InvalidBlockException(
                'Required object metadata key {} is missing for block {} (UID {}).'.format(
                    self._CHECKSUM_KEY, block.id, block.uid), block)

        if not metadata_only and self._TRANSFORMS_KEY in metadata:
            data = self._decapsulate(data, metadata[self._TRANSFORMS_KEY])  # type: ignore

        logger.debug('{} read data of uid {} in {:.2f}s{}'.format(threading.current_thread().name, block.uid, t2 - t1,
                                                                  ' (metadata only)' if metadata_only else ''))

        return block, data, metadata

    def read_block_async(self, block: Block, metadata_only: bool = False) -> None:

        def job():
            return self._read(block.deref(), metadata_only)

        self._read_executor.submit(job)

    def read_block(self, block: Block, metadata_only: bool = False) -> Optional[bytes]:
        return self._read(block.deref(), metadata_only)[1]

    def read_get_completed(self,
                           timeout: int = None) -> Iterator[Union[Tuple[DereferencedBlock, bytes, Dict], BaseException]]:
        return self._read_executor.get_completed(timeout=timeout)

    def check_block_metadata(self, *, block: DereferencedBlock, data_length: Optional[int], metadata: Dict) -> None:
        # Existence of keys has already been checked in _decode_metadata() and _read()
        if metadata[self._SIZE_KEY] != block.size:
            raise ValueError(
                'Mismatch between recorded block size and data length in object metadata for block {} (UID {}). '
                'Expected: {}, got: {}.'.format(block.id, block.uid, block.size, metadata[self._SIZE_KEY]))

        if data_length and data_length != block.size:
            raise ValueError('Mismatch between recorded block size and actual data length for block {} (UID {}). '
                             'Expected: {}, got: {}.'.format(block.id, block.uid, block.size, data_length))

        if block.checksum != metadata[self._CHECKSUM_KEY]:
            raise ValueError(
                'Mismatch between recorded block checksum and checksum in object metadata for block {} (UID {}). '
                'Expected: {}, got: {}.'.format(
                    block.id,
                    block.uid,
                    cast(str, block.checksum)[:16],  # We know that block.checksum is set
                    metadata[self._CHECKSUM_KEY][:16]))

    def _rm_block(self, uid: BlockUid) -> BlockUid:
        key = uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX
        try:
            self._rm_object(key)
        except FileNotFoundError as exception:
            raise BlockNotFoundError('Block UID {} not found on storage.'.format(str(uid)), uid) from exception
        finally:
            try:
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
        return uid

    def rm_block_async(self, uid: BlockUid) -> None:

        def job():
            return self._rm_block(uid)

        self._remove_executor.submit(job)

    def rm_block(self, uid: BlockUid) -> None:
        self._rm_block(uid)

    def rm_get_completed(self, timeout: int = None) -> Iterator[Union[BlockUid, BaseException]]:
        return self._remove_executor.get_completed(timeout=timeout)

    def wait_rms_finished(self):
        self._remove_executor.wait_for_all()

    # def rm_many_blocks(self, uids: Union[Sequence[BlockUid], AbstractSet[BlockUid]]) -> List[BlockUid]:
    #     keys = [uid.storage_object_to_path() for uid in uids]
    #     metadata_keys = [key + self._META_SUFFIX for key in keys]
    #
    #     errors = self._rm_many_objects(keys)
    #     self._rm_many_objects(metadata_keys)
    #     return [cast(BlockUid, BlockUid.storage_path_to_object(error)) for error in errors]

    def list_blocks(self) -> Iterable[BlockUid]:
        keys = self._list_objects(BlockUid.storage_prefix())
        for key in keys:
            assert isinstance(key, str)
            if key.endswith(self._META_SUFFIX):
                continue
            try:
                yield cast(BlockUid, BlockUid.storage_path_to_object(key))
            except (RuntimeError, ValueError):
                # Ignore any keys which don't match our pattern to account for stray objects/files
                pass

    def list_versions(self) -> Iterable[VersionUid]:
        keys = self._list_objects(VersionUid.storage_prefix())
        for key in keys:
            assert isinstance(key, str)
            if key.endswith(self._META_SUFFIX):
                continue
            try:
                yield cast(VersionUid, VersionUid.storage_path_to_object(key))
            except (RuntimeError, ValueError):
                # Ignore any keys which don't match our pattern to account for stray objects/files
                pass

    def read_version(self, version_uid: VersionUid) -> str:
        key = version_uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX
        data = self._read_object(key)
        metadata_json = self._read_object(metadata_key)

        metadata = self._decode_metadata(metadata_json=metadata_json, key=key, data_length=len(data))

        if self._TRANSFORMS_KEY in metadata:
            data = self._decapsulate(data, metadata[self._TRANSFORMS_KEY])

        if len(data) != metadata[self._SIZE_KEY]:
            raise ValueError('Length mismatch of original data for object {}. Expected: {}, got: {}.'.format(
                key, metadata[self._SIZE_KEY], len(data)))

        return data.decode('utf-8')

    def write_version(self, version_uid: VersionUid, data: str, overwrite: Optional[bool] = False) -> None:
        key = version_uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX

        if not overwrite:
            try:
                self._read_object(key)
            except FileNotFoundError:
                pass
            else:
                raise FileExistsError('Version {} already exists in storage.'.format(version_uid.v_string))

        data_bytes = data.encode('utf-8')
        size = len(data_bytes)

        data_bytes, transforms_metadata = self._encapsulate(data_bytes)
        metadata, metadata_json = self._build_metadata(
            size=size, object_size=len(data_bytes), transforms_metadata=transforms_metadata)

        try:
            self._write_object(key, data_bytes)
            self._write_object(metadata_key, metadata_json)
        except:
            try:
                self._rm_object(key)
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
            raise

        if self._consistency_check_writes:
            self._check_write(key=key, metadata_key=metadata_key, data_expected=data_bytes)

    def rm_version(self, version_uid: VersionUid) -> None:
        key = version_uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX
        try:
            self._rm_object(key)
        finally:
            try:
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass

    def storage_stats(self) -> Tuple[int, int]:
        objects_count = 0
        objects_size = 0
        for key, size in cast(Iterable[Tuple[str, int]], self._list_objects(include_size=True)):
            objects_count += 1
            objects_size += size
        return objects_count, objects_size

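    # Transforms (e.g. compression or encryption modules) are applied in their configured
    # order on write; the per-object metadata records the chain so that _decapsulate()
    # can replay it in reverse on read.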
    def _encapsulate(self, data: bytes) -> Tuple[bytes, List]:
        if self._active_transforms is not None:
            transforms_metadata = []
            for transform in self._active_transforms:
                data_encapsulated, materials = transform.encapsulate(data=data)
                if data_encapsulated:
                    transforms_metadata.append({
                        'name': transform.name,
                        'module': transform.module,
                        'materials': materials,
                    })
                    data = data_encapsulated
            return data, transforms_metadata
        else:
            return data, []

    def _decapsulate(self, data: bytes, transforms_metadata: Sequence[Dict]) -> bytes:
        for element in reversed(transforms_metadata):
            name = element['name']
            module = element['module']
            transform = TransformFactory.get_by_name(name)
            if transform:
                if module != transform.module:
                    raise ConfigurationError('Mismatch between object transform module and configured module for ' +
                                             '{} ({} != {})'.format(name, module, transform.module))

                data = transform.decapsulate(data=data, materials=element['materials'])
            else:
                raise IOError('Unknown transform {} in object metadata.'.format(name))
        return data

    def wait_writes_finished(self) -> None:
        self._write_executor.wait_for_all()

    def use_read_cache(self, enable: bool) -> bool:
        return False

    def close(self) -> None:
        self._read_executor.shutdown()
        self._write_executor.shutdown()
        self._remove_executor.shutdown()

    @abstractmethod
    def _write_object(self, key: str, data: bytes):
        raise NotImplementedError

    @abstractmethod
    def _read_object(self, key: str) -> bytes:
        raise NotImplementedError

    @abstractmethod
    def _read_object_length(self, key: str) -> int:
        raise NotImplementedError

    @abstractmethod
    def _rm_object(self, key: str) -> None:
        raise NotImplementedError

    @abstractmethod
    def _list_objects(self, prefix: str = None,
                      include_size: bool = False) -> Union[Iterable[str], Iterable[Tuple[str, int]]]:
        raise NotImplementedError
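
A minimal sketch of a concrete backend built on the contract above; the class name InMemoryStorage and the plain dict used as the object store are illustrative assumptions, not part of Benji. Real backends (file, S3, B2) implement the same five abstract methods against their respective stores, and instantiating any of them still goes through the Config/module_configuration plumbing of StorageBase.__init__.

class InMemoryStorage(StorageBase):

    def __init__(self, *, config, name, storage_id, module_configuration):
        super().__init__(config=config, name=name, storage_id=storage_id,
                         module_configuration=module_configuration)
        self._objects = {}  # key -> raw object bytes

    def _write_object(self, key: str, data: bytes):
        self._objects[key] = data

    def _read_object(self, key: str) -> bytes:
        try:
            return self._objects[key]
        except KeyError:
            # StorageBase expects FileNotFoundError for missing objects
            raise FileNotFoundError('Object {} not found.'.format(key)) from None

    def _read_object_length(self, key: str) -> int:
        return len(self._read_object(key))

    def _rm_object(self, key: str) -> None:
        try:
            del self._objects[key]
        except KeyError:
            raise FileNotFoundError('Object {} not found.'.format(key)) from None

    def _list_objects(self, prefix: str = None, include_size: bool = False):
        for key, data in list(self._objects.items()):
            if prefix is not None and not key.startswith(prefix):
                continue
            yield (key, len(data)) if include_size else key
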
Example #2
class DataBackend(metaclass=ABCMeta):

    _COMPRESSION_KEY = 'compression'
    _ENCRYPTION_KEY = 'encryption'
    _SIZE_KEY = 'size'
    _OBJECT_SIZE_KEY = 'object_size'
    _CHECKSUM_KEY = 'checksum'

    PACKAGE_PREFIX = 'benji.data_backends'
    _ENCRYPTION_PACKAGE_PREFIX = PACKAGE_PREFIX + '.encryption'
    _COMPRESSION_PACKAGE_PREFIX = PACKAGE_PREFIX + '.compression'

    # For the benefit of the file and B2 backends these must end in a slash
    _BLOCKS_PREFIX = 'blocks/'
    _VERSIONS_PREFIX = 'versions/'

    _META_SUFFIX = '.meta'

    def __init__(self, config):
        self.encryption = {}
        self.compression = {}
        self.active_encryption = None
        self.active_compression = None

        encryption_modules = config.get('dataBackend.encryption',
                                        None,
                                        types=list)
        if encryption_modules is not None:
            for encryption_module_dict in encryption_modules:
                type = config.get_from_dict(encryption_module_dict,
                                            'type',
                                            types=str)
                identifier = config.get_from_dict(encryption_module_dict,
                                                  'identifier',
                                                  types=str)
                materials = config.get_from_dict(encryption_module_dict,
                                                 'materials',
                                                 types=dict)
                try:
                    encryption_module = importlib.import_module('{}.{}'.format(
                        self._ENCRYPTION_PACKAGE_PREFIX, type))
                except ImportError:
                    raise ConfigurationError(
                        'Module file {}.{} not found or related import error.'.
                        format(self._ENCRYPTION_PACKAGE_PREFIX, type))
                else:
                    if type != encryption_module.Encryption.NAME:
                        raise InternalError(
                            'Encryption module type and name don\'t agree ({} != {}).'
                            .format(type, encryption_module.Encryption.NAME))

                    self.encryption[identifier] = encryption_module.Encryption(
                        identifier=identifier, materials=materials)

        active_encryption = config.get(
            'dataBackend.{}.activeEncryption'.format(self.NAME),
            None,
            types=str)
        if active_encryption is not None:
            if self.encryption and active_encryption in self.encryption:
                logger.info(
                    'Encryption is enabled for the {} data backend.'.format(
                        self.NAME))
                self.active_encryption = self.encryption[active_encryption]
            else:
                raise ConfigurationError(
                    'Encryption identifier {} is unknown.'.format(
                        active_encryption))

        compression_modules = config.get('dataBackend.compression',
                                         None,
                                         types=list)
        if compression_modules is not None:
            for compression_module_dict in compression_modules:
                type = config.get_from_dict(compression_module_dict,
                                            'type',
                                            types=str)
                materials = config.get_from_dict(compression_module_dict,
                                                 'materials',
                                                 None,
                                                 types=dict)
                try:
                    compression_module = importlib.import_module(
                        '{}.{}'.format(self._COMPRESSION_PACKAGE_PREFIX, type))
                except ImportError:
                    raise ConfigurationError(
                        'Module file {}.{} not found or related import error.'.
                        format(self._COMPRESSION_PACKAGE_PREFIX, type))
                else:
                    if type != compression_module.Compression.NAME:
                        raise InternalError(
                            'Compression module type and name don\'t agree ({} != {}).'
                            .format(type, compression_module.Compression.NAME))

                    self.compression[type] = compression_module.Compression(
                        materials=materials)

        active_compression = config.get(
            'dataBackend.{}.activeCompression'.format(self.NAME),
            None,
            types=str)
        if active_compression is not None:
            if self.compression and active_compression in self.compression:
                logger.info(
                    'Compression is enabled for the {} data backend.'.format(
                        self.NAME))
                self.active_compression = self.compression[active_compression]
            else:
                raise ConfigurationError(
                    'Compression type {} is unknown.'.format(
                        active_compression))

        simultaneous_writes = config.get('dataBackend.simultaneousWrites',
                                         types=int)
        simultaneous_reads = config.get('dataBackend.simultaneousReads',
                                        types=int)
        bandwidth_read = config.get('dataBackend.bandwidthRead', types=int)
        bandwidth_write = config.get('dataBackend.bandwidthWrite', types=int)

        self._consistency_check_writes = config.get(
            'dataBackend.consistencyCheckWrites', False, types=bool)

        self._compression_statistics = {
            'objects_considered': 0,
            'objects_compressed': 0,
            'data_in': 0,
            'data_out': 0,
            'data_in_compression': 0,
            'data_out_compression': 0
        }

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(
            bandwidth_write)  # 0 disables throttling

        self._read_executor = ThreadPoolExecutor(
            max_workers=simultaneous_reads,
            thread_name_prefix='DataBackend-Reader')
        self._read_futures = []
        self._read_semaphore = BoundedSemaphore(simultaneous_reads +
                                                self.READ_QUEUE_LENGTH)

        self._write_executor = ThreadPoolExecutor(
            max_workers=simultaneous_writes,
            thread_name_prefix='DataBackend-Writer')
        self._write_futures = []
        self._write_semaphore = BoundedSemaphore(simultaneous_writes +
                                                 self.WRITE_QUEUE_LENGTH)

    def _check_write(self, key, metadata_key, data, metadata):
        # Source: https://stackoverflow.com/questions/4527942/comparing-two-dictionaries-in-python
        def dict_compare(d1, d2):
            d1_keys = set(d1.keys())
            d2_keys = set(d2.keys())
            intersect_keys = d1_keys.intersection(d2_keys)
            added = d1_keys - d2_keys
            removed = d2_keys - d1_keys
            modified = {
                o: (d1[o], d2[o])
                for o in intersect_keys if d1[o] != d2[o]
            }
            same = set(o for o in intersect_keys if d1[o] == d2[o])
            return added, removed, modified, same

        rdata = self._read_object(key)
        rmetadata = self._read_object(metadata_key)
        rmetadata = json.loads(rmetadata.decode('utf-8'))

        if metadata:
            added, removed, modified, same = dict_compare(rmetadata, metadata)
            logger.debug(
                'Comparing written and read metadata of {}:'.format(key))
            logger.debug(
                '  added: {}, removed: {}, modified: {}, same: {}'.format(
                    added, removed, modified, same))
            if removed:
                raise InternalError(
                    'Consistency check: Metadata headers are missing in read back data: {}'
                    .format(', '.join(removed)))
            different_for = []
            for name in modified:
                logger.debug('Metadata differences: ')
                logger.debug('  {}: wrote {}, read {}'.format(
                    name, metadata[name], rmetadata[name]))
                if metadata[name] != rmetadata[name]:
                    different_for.append(name)
            if different_for:
                raise InternalError(
                    'Consistency check: Written and read metadata of {} are different for {}.'
                    .format(key, ', '.join(different_for)))
        # Comparing encrypted/compressed data here
        if data != rdata:
            raise InternalError(
                'Consistency check: Written and read data of {} differ.'.
                format(key))

    def _write(self, block, data):
        data, metadata = self._compress(data)
        data, metadata_2 = self._encrypt(data)
        metadata.update(metadata_2)

        metadata[self._SIZE_KEY] = block.size
        metadata[self._OBJECT_SIZE_KEY] = len(data)
        metadata[self._CHECKSUM_KEY] = block.checksum
        metadata_json = json.dumps(metadata,
                                   separators=(',', ':')).encode('utf-8')

        logger.debug('Metadata of block {}: {}'.format(block.uid, metadata))

        key = self._block_uid_to_key(block.uid)
        metadata_key = key + self._META_SUFFIX

        time.sleep(
            self.write_throttling.consume(len(data) + len(metadata_json)))
        t1 = time.time()
        try:
            self._write_object(key, data)
            self._write_object(metadata_key, metadata_json)
        except:
            try:
                self._rm_object(key)
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
            raise
        t2 = time.time()

        logger.debug('{} wrote data of uid {} in {:.2f}s'.format(
            threading.current_thread().name, block.uid, t2 - t1))
        if self._consistency_check_writes:
            self._check_write(key, metadata_key, data, metadata)

        return block

    def save(self, block, data, sync=False):
        if sync:
            self._write(block, data)
        else:
            self._write_semaphore.acquire()

            def write_with_release():
                try:
                    return self._write(block, data)
                finally:
                    self._write_semaphore.release()

            self._write_futures.append(
                self._write_executor.submit(write_with_release))

    def save_get_completed(self, timeout=None):
        """ Returns a generator for all completed read jobs
        """
        return future_results_as_completed(self._write_futures,
                                           timeout=timeout)

    def _read(self, block, metadata_only):
        key = self._block_uid_to_key(block.uid)
        metadata_key = key + self._META_SUFFIX
        t1 = time.time()
        if not metadata_only:
            data = self._read_object(key)
            data_length = len(data)
        else:
            data = None
            data_length = self._read_object_length(key)
        metadata = self._read_object(metadata_key)
        time.sleep(
            self.read_throttling.consume(
                (len(data) if data else 0) + len(metadata)))
        t2 = time.time()

        metadata = json.loads(metadata.decode('utf-8'))
        if self._OBJECT_SIZE_KEY not in metadata:
            raise KeyError(
                'Required metadata key {} is missing for block {} (UID {}).'.
                format(self._OBJECT_SIZE_KEY, block.id, block.uid))

        if data_length != metadata[self._OBJECT_SIZE_KEY]:
            raise ValueError(
                'Mismatch between recorded object size and actual object size for block {} (UID {}). '
                'Expected: {}, got: {}.'.format(
                    block.id, block.uid, metadata[self._OBJECT_SIZE_KEY],
                    data_length))

        if not metadata_only:
            data = self._decrypt(data, metadata)
            data = self._uncompress(data, metadata)

        logger.debug('{} read data of uid {} in {:.2f}s{}'.format(
            threading.current_thread().name, block.uid, t2 - t1,
            ' (metadata only)' if metadata_only else ''))

        return block, data, metadata

    def read(self, block, sync=False, metadata_only=False):
        if sync:
            return self._read(block, metadata_only)[1]
        else:

            def read_with_acquire():
                self._read_semaphore.acquire()
                return self._read(block, metadata_only)

            self._read_futures.append(
                self._read_executor.submit(read_with_acquire))

    def read_get_completed(self, timeout=None):
        """ Returns a generator for all completed read jobs
        """
        return future_results_as_completed(self._read_futures,
                                           semaphore=self._read_semaphore,
                                           timeout=timeout)

    def check_block_metadata(self, *, block, data_length, metadata):
        for required_key in [self._SIZE_KEY, self._CHECKSUM_KEY]:
            if required_key not in metadata:
                raise KeyError(
                    'Required metadata key {} is missing for block {} (UID {}).'
                    .format(required_key, block.id, block.uid))

        if metadata[self._SIZE_KEY] != block.size:
            raise ValueError(
                'Mismatch between recorded block size and data length in metadata for block {} (UID {}). '
                'Expected: {}, got: {}.'.format(block.id, block.uid,
                                                block.size,
                                                metadata[self._SIZE_KEY]))

        if data_length and data_length != block.size:
            raise ValueError(
                'Mismatch between recorded block size and actual data length for block {} (UID {}). '
                'Expected: {}, got: {}.'.format(block.id, block.uid,
                                                block.size, data_length))

        if block.checksum != metadata[self._CHECKSUM_KEY]:
            raise ValueError(
                'Mismatch between recorded block checksum and checksum in metadata for block {} (UID {}). '
                'Expected: {}, got: {}.'.format(
                    block.id, block.uid, block.checksum[:16],
                    metadata[self._CHECKSUM_KEY][:16]))

    def rm(self, uid):
        key = self._block_uid_to_key(uid)
        metadata_key = key + self._META_SUFFIX
        try:
            self._rm_object(key)
        finally:
            try:
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass

    def rm_many(self, uids):
        keys = [self._block_uid_to_key(uid) for uid in uids]
        metadata_keys = [key + self._META_SUFFIX for key in keys]

        errors = self._rm_many_objects(keys)
        self._rm_many_objects(metadata_keys)
        return [self._key_to_block_uid(error) for error in errors]

    def list_blocks(self):
        keys = self._list_objects(self._BLOCKS_PREFIX)
        block_uids = []
        for key in keys:
            if key.endswith(self._META_SUFFIX):
                continue
            try:
                block_uids.append(self._key_to_block_uid(key))
            except (RuntimeError, ValueError):
                # Ignore any keys which don't match our pattern to account for stray objects/files
                pass
        return block_uids

    def list_versions(self):
        keys = self._list_objects(self._VERSIONS_PREFIX)
        version_uids = []
        for key in keys:
            if key.endswith(self._META_SUFFIX):
                continue
            try:
                version_uids.append(self._key_to_version_uid(key))
            except (RuntimeError, ValueError):
                # Ignore any keys which don't match our pattern to account for stray objects/files
                pass
        return version_uids

    def read_version(self, version_uid):
        key = self._version_uid_to_key(version_uid)
        metadata_key = key + self._META_SUFFIX
        data = self._read_object(key)
        metadata = self._read_object(metadata_key)

        metadata = json.loads(metadata.decode('utf-8'))
        for required_key in [self._OBJECT_SIZE_KEY, self._SIZE_KEY]:
            if required_key not in metadata:
                raise KeyError(
                    'Required metadata key {} is missing for object {}.'.
                    format(required_key, key))

        if len(data) != metadata[self._OBJECT_SIZE_KEY]:
            raise ValueError(
                'Length mismatch for object {}. Expected: {}, got: {}.'.format(
                    key, metadata[self._OBJECT_SIZE_KEY], len(data)))

        data = self._decrypt(data, metadata)
        data = self._uncompress(data, metadata)

        if len(data) != metadata[self._SIZE_KEY]:
            raise ValueError(
                'Length mismatch of original data for object {}. Expected: {}, got: {}.'
                .format(key, metadata[self._SIZE_KEY], len(data)))

        data = data.decode('utf-8')
        return data

    def save_version(self, version_uid, data, overwrite=False):
        key = self._version_uid_to_key(version_uid)
        metadata_key = key + self._META_SUFFIX

        if not overwrite:
            try:
                self._read_object(key)
            except FileNotFoundError:
                pass
            else:
                raise FileExistsError(
                    'Version {} already exists in data backend.'.format(
                        version_uid.readable))

        data = data.encode('utf-8')
        size = len(data)
        data, metadata = self._compress(data)
        data, metadata_2 = self._encrypt(data)
        metadata.update(metadata_2)

        metadata[self._SIZE_KEY] = size
        metadata[self._OBJECT_SIZE_KEY] = len(data)
        metadata_json = json.dumps(metadata,
                                   separators=(',', ':')).encode('utf-8')

        try:
            self._write_object(key, data)
            self._write_object(metadata_key, metadata_json)
        except:
            try:
                self._rm_object(key)
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
            raise

        if self._consistency_check_writes:
            self._check_write(key, metadata_key, data, metadata)

    def rm_version(self, version_uid):
        key = self._version_uid_to_key(version_uid)
        metadata_key = key + self._META_SUFFIX
        try:
            self._rm_object(key)
        finally:
            try:
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass

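    # Encryption and compression wrap the object payload and record the parameters needed
    # to reverse them (identifier/type and materials) in the per-object metadata;
    # _decrypt() and _uncompress() look the module up again from that metadata on read.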
    def _encrypt(self, data):
        if self.active_encryption is not None:
            data, materials = self.active_encryption.encrypt(data=data)
            metadata = {
                self._ENCRYPTION_KEY: {
                    'identifier': self.active_encryption.identifier,
                    'type': self.active_encryption.NAME,
                    'materials': materials
                }
            }
            return data, metadata
        else:
            return data, {}

    def _decrypt(self, data, metadata):
        if self._ENCRYPTION_KEY in metadata:
            identifier = metadata[self._ENCRYPTION_KEY]['identifier']
            type = metadata[self._ENCRYPTION_KEY]['type']
            if identifier in self.encryption:
                encryption = self.encryption[identifier]
                if type != encryption.NAME:
                    raise ConfigurationError(
                        'Mismatch between object encryption type and configured type for identifier '
                        + '{} ({} != {})'.format(identifier, type,
                                                 encryption.NAME))

                return encryption.decrypt(
                    data=data,
                    materials=metadata[self._ENCRYPTION_KEY]['materials'])
            else:
                raise IOError(
                    'Unknown encryption identifier {} in object metadata.'.
                    format(identifier))
        else:
            return data

    def _compress(self, data):
        self._compression_statistics['objects_considered'] += 1
        self._compression_statistics['data_in'] += len(data)

        if self.active_compression is not None:
            compressed_data, materials = self.active_compression.compress(
                data=data)
            if len(compressed_data) < len(data):
                self._compression_statistics['objects_compressed'] += 1
                self._compression_statistics['data_in_compression'] += len(
                    data)
                self._compression_statistics['data_out_compression'] += len(
                    compressed_data)
                self._compression_statistics['data_out'] += len(
                    compressed_data)

                metadata = {
                    self._COMPRESSION_KEY: {
                        'type': self.active_compression.NAME,
                        'materials': materials
                    }
                }
                return compressed_data, metadata
            else:
                self._compression_statistics['data_out'] += len(data)
                return data, {}
        else:
            self._compression_statistics['data_out'] += len(data)
            return data, {}

    def _uncompress(self, data, metadata):
        if self._COMPRESSION_KEY in metadata:
            type = metadata[self._COMPRESSION_KEY]['type']
            if type in self.compression:
                return self.compression[type].uncompress(
                    data=data,
                    materials=metadata[self._COMPRESSION_KEY]['materials'],
                    original_size=metadata[self._SIZE_KEY])
            else:
                raise IOError(
                    'Unsupported compression type {} in object metadata.'.
                    format(type))
        else:
            return data

    def wait_reads_finished(self):
        concurrent.futures.wait(self._read_futures)

    def wait_saves_finished(self):
        concurrent.futures.wait(self._write_futures)

    def use_read_cache(self, enable):
        return False

    def _log_compression_statistics(self):
        if self.active_compression is None or self._compression_statistics[
                'objects_considered'] == 0:
            return

        overall_ratio, ratio = 0.0, 0.0
        if self._compression_statistics['data_out'] > 0:
            overall_ratio = self._compression_statistics[
                'data_in'] / self._compression_statistics['data_out']

        if self._compression_statistics['data_out_compression'] > 0:
            ratio = self._compression_statistics['data_in_compression'] \
                    / self._compression_statistics['data_out_compression']

        tbl = PrettyTable()
        tbl.field_names = [
            'Objects considered', 'Objects compressed', 'Data in', 'Data out',
            'Overall compression ratio', 'Data input to compression',
            'Data output from compression', 'Compression ratio'
        ]
        tbl.align['Objects considered'] = 'r'
        tbl.align['Objects compressed'] = 'r'
        tbl.align['Data in'] = 'r'
        tbl.align['Data out'] = 'r'
        tbl.align['Overall compression ratio'] = 'r'
        tbl.align['Data input to compression'] = 'r'
        tbl.align['Data output from compression'] = 'r'
        tbl.align['Compression ratio'] = 'r'
        tbl.add_row([
            self._compression_statistics['objects_considered'],
            self._compression_statistics['objects_compressed'],
            self._compression_statistics['data_in'],
            self._compression_statistics['data_out'],
            '{:.2f}'.format(overall_ratio),
            self._compression_statistics['data_in_compression'],
            self._compression_statistics['data_out_compression'],
            '{:.2f}'.format(ratio)
        ])
        logger.info('Compression statistics:  \n' +
                    textwrap.indent(str(tbl), '          '))

    def close(self):
        self._log_compression_statistics()

        if len(self._read_futures) > 0:
            logger.warning(
                'Data backend closed with {} outstanding read jobs, cancelling them.'
                .format(len(self._read_futures)))
            for future in self._read_futures:
                future.cancel()
            logger.debug('Data backend cancelled all outstanding read jobs.')
            # Get all jobs so that the semaphore gets released and still waiting jobs can complete
            for future in self.read_get_completed():
                pass
            logger.debug(
                'Data backend read results from all outstanding read jobs.')
        if len(self._write_futures) > 0:
            logger.warning(
                'Data backend closed with {} outstanding write jobs, cancelling them.'
                .format(len(self._write_futures)))
            for future in self._write_futures:
                future.cancel()
            logger.debug('Data backend cancelled all outstanding write jobs.')
            # Write jobs release their semaphore at completion so we don't need to collect the results
            self._write_futures = []
        self._write_executor.shutdown()
        self._read_executor.shutdown()

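    # Block keys are filed under a two-level prefix derived from an MD5 digest of the UID
    # (a common technique to spread keys across object-store partitions);
    # _key_to_block_uid() parses the UID back out of the fixed-length key.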
    def _block_uid_to_key(self, block_uid):
        key_name = '{:016x}-{:016x}'.format(block_uid.left, block_uid.right)
        digest = hashlib.md5(key_name.encode('ascii')).hexdigest()
        return '{}{}/{}/{}-{}'.format(self._BLOCKS_PREFIX, digest[0:2],
                                      digest[2:4], digest[:8], key_name)

    def _key_to_block_uid(self, key):
        bpl = len(self._BLOCKS_PREFIX)
        if len(key) != 48 + bpl:
            raise RuntimeError('Invalid key name {}'.format(key))
        return BlockUid(int(key[15 + bpl:15 + bpl + 16], 16),
                        int(key[32 + bpl:32 + bpl + 16], 16))

    def _version_uid_to_key(self, version_uid):
        return '{}{}/{}/{}'.format(self._VERSIONS_PREFIX,
                                   version_uid.readable[-1:],
                                   version_uid.readable[-2:-1],
                                   version_uid.readable)

    def _key_to_version_uid(self, key):
        vpl = len(self._VERSIONS_PREFIX)
        vl = len(VersionUid(1).readable)
        if len(key) != vpl + vl + 4:
            raise RuntimeError('Invalid key name {}'.format(key))
        return VersionUid.create_from_readables(key[vpl + 4:vpl + vl + 4])

    @abstractmethod
    def _write_object(self, key, data):
        raise NotImplementedError

    @abstractmethod
    def _read_object(self, key):
        raise NotImplementedError

    @abstractmethod
    def _read_object_length(self, key):
        raise NotImplementedError

    @abstractmethod
    def _rm_object(self, key):
        raise NotImplementedError

    @abstractmethod
    def _rm_many_objects(self, keys):
        raise NotImplementedError

    @abstractmethod
    def _list_objects(self, prefix):
        raise NotImplementedError
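
The block key layout above is easy to trace by hand: the UID is rendered as two 16-digit hexadecimal numbers and placed under a two-level prefix taken from its MD5 digest (a common way to spread keys across object-store partitions), and _key_to_block_uid() parses the UID back out of the fixed-length key. A minimal standalone restatement with made-up UID values follows; the digest characters in the printed key naturally depend on the actual MD5 output.

import hashlib

# Standalone restatement of DataBackend._block_uid_to_key() for illustration only.
# The UID values are made up; a real BlockUid carries two 64-bit integers.
left, right = 42, 1
key_name = '{:016x}-{:016x}'.format(left, right)  # '000000000000002a-0000000000000001'
digest = hashlib.md5(key_name.encode('ascii')).hexdigest()
key = 'blocks/{}/{}/{}-{}'.format(digest[0:2], digest[2:4], digest[:8], key_name)
print(key)  # e.g. 'blocks/xx/yy/xxyyzzzz-000000000000002a-0000000000000001'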