class StorageBase(ReprMixIn, metaclass=ABCMeta):

    _CHECKSUM_KEY = 'checksum'
    _CREATED_KEY = 'created'
    _MODIFIED_KEY = 'modified'
    _HMAC_KEY = 'hmac'
    _METADATA_VERSION_KEY = 'metadata_version'
    _OBJECT_SIZE_KEY = 'object_size'
    _SIZE_KEY = 'size'
    _TRANSFORMS_KEY = 'transforms'

    _META_SUFFIX = '.meta'

    def __init__(self, *, config: Config, name: str, storage_id: int, module_configuration: ConfigDict) -> None:
        self._name = name
        self._storage_id = storage_id
        self._active_transforms: List[TransformBase] = []

        active_transforms = Config.get_from_dict(module_configuration, 'activeTransforms', None, types=list)
        if active_transforms is not None:
            for transform in active_transforms:
                self._active_transforms.append(TransformFactory.get_by_name(transform))
            logger.info('Active transforms for storage {}: {}.'.format(
                name, ', '.join(
                    ['{} ({})'.format(transform.name, transform.module) for transform in self._active_transforms])))

        simultaneous_writes = Config.get_from_dict(module_configuration, 'simultaneousWrites', types=int)
        simultaneous_reads = Config.get_from_dict(module_configuration, 'simultaneousReads', types=int)
        simultaneous_removals = Config.get_from_dict(module_configuration, 'simultaneousRemovals', types=int)
        bandwidth_read = Config.get_from_dict(module_configuration, 'bandwidthRead', types=int)
        bandwidth_write = Config.get_from_dict(module_configuration, 'bandwidthWrite', types=int)

        self._consistency_check_writes = Config.get_from_dict(
            module_configuration, 'consistencyCheckWrites', False, types=bool)

        hmac_key_encoded = Config.get_from_dict(module_configuration, 'hmac.key', None, types=str)
        hmac_key: Optional[bytes] = None
        if hmac_key_encoded is None:
            hmac_password = Config.get_from_dict(module_configuration, 'hmac.password', None, types=str)
            if hmac_password is not None:
                hmac_kdf_salt = base64.b64decode(Config.get_from_dict(module_configuration, 'hmac.kdfSalt', types=str))
                hmac_kdf_iterations = Config.get_from_dict(module_configuration, 'hmac.kdfIterations', types=int)
                hmac_key = derive_key(
                    salt=hmac_kdf_salt, iterations=hmac_kdf_iterations, key_length=32, password=hmac_password)
        else:
            hmac_key = base64.b64decode(hmac_key_encoded)
        self._dict_hmac: Optional[DictHMAC] = None
        if hmac_key is not None:
            logger.info('Enabling HMAC object metadata integrity protection for storage {}.'.format(name))
            self._dict_hmac = DictHMAC(hmac_key=self._HMAC_KEY, secret_key=hmac_key)

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self._read_executor = JobExecutor(name='Storage-Read', workers=simultaneous_reads, blocking_submit=False)
        self._write_executor = JobExecutor(name='Storage-Write', workers=simultaneous_writes, blocking_submit=True)
        self._remove_executor = JobExecutor(name='Storage-Remove', workers=simultaneous_removals, blocking_submit=True)

    @property
    def name(self) -> str:
        return self._name

    @property
    def storage_id(self) -> int:
        return self._storage_id

    def _build_metadata(self,
                        *,
                        size: int,
                        object_size: int,
                        transforms_metadata: List[Dict] = None,
                        checksum: str = None) -> Tuple[Dict, bytes]:

        timestamp = datetime.datetime.utcnow().isoformat(timespec='microseconds')
        metadata: Dict = {
            self._CREATED_KEY: timestamp,
            self._METADATA_VERSION_KEY: str(VERSIONS.object_metadata.current),
            self._MODIFIED_KEY: timestamp,
            self._OBJECT_SIZE_KEY: object_size,
            self._SIZE_KEY: size,
        }

        if checksum:
            metadata[self._CHECKSUM_KEY] = checksum

        if transforms_metadata:
            metadata[self._TRANSFORMS_KEY] = transforms_metadata

        if self._dict_hmac:
            self._dict_hmac.add_digest(metadata)

        return metadata, json.dumps(metadata, separators=(',', ':')).encode('utf-8')

    def _decode_metadata(self, *, metadata_json: bytes, key: str, data_length: int) -> Dict:
        metadata = json.loads(metadata_json.decode('utf-8'))

        if self._dict_hmac:
            self._dict_hmac.verify_digest(metadata)

        # We currently support only one object metadata version
        if self._METADATA_VERSION_KEY not in metadata:
            raise KeyError('Required object metadata key {} is missing for object {}.'.format(
                self._METADATA_VERSION_KEY, key))

        version_obj = semantic_version.Version(metadata[self._METADATA_VERSION_KEY])
        if version_obj not in VERSIONS.object_metadata.supported:
            raise ValueError('Unsupported object metadata version: "{}".'.format(str(version_obj)))

        for required_key in [self._CREATED_KEY, self._MODIFIED_KEY, self._OBJECT_SIZE_KEY, self._SIZE_KEY]:
            if required_key not in metadata:
                raise KeyError('Required object metadata key {} is missing for object {}.'.format(required_key, key))

        if data_length != metadata[self._OBJECT_SIZE_KEY]:
            raise ValueError('Length mismatch for object {}. Expected: {}, got: {}.'.format(
                key, metadata[self._OBJECT_SIZE_KEY], data_length))

        return metadata

    def _check_write(self, *, key: str, metadata_key: str, data_expected: bytes) -> None:
        data_actual = self._read_object(key)
        metadata_actual_json = self._read_object(metadata_key)

        # Return value is ignored
        self._decode_metadata(metadata_json=metadata_actual_json, key=key, data_length=len(data_actual))

        # Comparing encapsulated data here
        if data_expected != data_actual:
            raise ValueError('Written and read data of {} differ.'.format(key))

    def _write(self, block: DereferencedBlock, data: bytes) -> DereferencedBlock:
        data, transforms_metadata = self._encapsulate(data)

        metadata, metadata_json = self._build_metadata(
            size=block.size, object_size=len(data), checksum=block.checksum, transforms_metadata=transforms_metadata)

        key = block.uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX

        time.sleep(self.write_throttling.consume(len(data) + len(metadata_json)))
        t1 = time.time()
        try:
            self._write_object(key, data)
            self._write_object(metadata_key, metadata_json)
        except:
            try:
                self._rm_object(key)
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
            raise
        t2 = time.time()

        logger.debug('{} wrote data of uid {} in {:.2f}s'.format(threading.current_thread().name, block.uid, t2 - t1))

        if self._consistency_check_writes:
            try:
                self._check_write(key=key, metadata_key=metadata_key, data_expected=data)
            except (KeyError, ValueError) as exception:
                raise InvalidBlockException('Check write of block {} (UID {}) failed.'.format(block.id, block.uid),
                                            block) from exception

        return block

    def write_block_async(self, block: Union[DereferencedBlock, Block], data: bytes) -> None:
        block_deref = block.deref() if isinstance(block, Block) else block

        def job():
            return self._write(block_deref, data)

        self._write_executor.submit(job)

    def write_block(self, block: Union[DereferencedBlock, Block], data: bytes) -> None:
        block_deref = block.deref() if isinstance(block, Block) else block
        self._write(block_deref, data)

    def write_get_completed(self, timeout: int = None) -> Iterator[Union[DereferencedBlock, BaseException]]:
        return self._write_executor.get_completed(timeout=timeout)

    def _read(self, block: DereferencedBlock, metadata_only: bool) -> Tuple[DereferencedBlock, Optional[bytes], Dict]:
        key = block.uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX
        data: Optional[bytes] = None
        try:
            t1 = time.time()
            if not metadata_only:
                data = self._read_object(key)
                data_length = len(data)
            else:
                data_length = self._read_object_length(key)
            metadata_json = self._read_object(metadata_key)
            time.sleep(self.read_throttling.consume((len(data) if data else 0) + len(metadata_json)))
            t2 = time.time()
        except FileNotFoundError as exception:
            raise InvalidBlockException(
                'Object metadata or data of block {} (UID {}) not found.'.format(block.id, block.uid),
                block) from exception

        try:
            metadata = self._decode_metadata(metadata_json=metadata_json, key=key, data_length=data_length)
        except (KeyError, ValueError) as exception:
            raise InvalidBlockException(
                'Object metadata of block {} (UID {}) is invalid.'.format(block.id, block.uid),
                block) from exception

        if self._CHECKSUM_KEY not in metadata:
            raise InvalidBlockException(
                'Required object metadata key {} is missing for block {} (UID {}).'.format(
                    self._CHECKSUM_KEY, block.id, block.uid), block)

        if not metadata_only and self._TRANSFORMS_KEY in metadata:
            data = self._decapsulate(data, metadata[self._TRANSFORMS_KEY])  # type: ignore

        logger.debug('{} read data of uid {} in {:.2f}s{}'.format(threading.current_thread().name, block.uid, t2 - t1,
                                                                  ' (metadata only)' if metadata_only else ''))

        return block, data, metadata

    def read_block_async(self, block: Block, metadata_only: bool = False) -> None:

        def job():
            return self._read(block.deref(), metadata_only)

        self._read_executor.submit(job)

    def read_block(self, block: Block, metadata_only: bool = False) -> Optional[bytes]:
        return self._read(block.deref(), metadata_only)[1]

    def read_get_completed(
            self, timeout: int = None) -> Iterator[Union[Tuple[DereferencedBlock, bytes, Dict], BaseException]]:
        return self._read_executor.get_completed(timeout=timeout)

    def check_block_metadata(self, *, block: DereferencedBlock, data_length: Optional[int], metadata: Dict) -> None:
        # Existence of keys has already been checked in _decode_metadata() and _read()
        if metadata[self._SIZE_KEY] != block.size:
            raise ValueError(
                'Mismatch between recorded block size and data length in object metadata for block {} (UID {}). '
                'Expected: {}, got: {}.'.format(block.id, block.uid, block.size, metadata[self._SIZE_KEY]))

        if data_length and data_length != block.size:
            raise ValueError('Mismatch between recorded block size and actual data length for block {} (UID {}). '
                             'Expected: {}, got: {}.'.format(block.id, block.uid, block.size, data_length))

        if block.checksum != metadata[self._CHECKSUM_KEY]:
            raise ValueError(
                'Mismatch between recorded block checksum and checksum in object metadata for block {} (UID {}). '
                'Expected: {}, got: {}.'.format(
                    block.id,
                    block.uid,
                    cast(str, block.checksum)[:16],  # We know that block.checksum is set
                    metadata[self._CHECKSUM_KEY][:16]))

    def _rm_block(self, uid: BlockUid) -> BlockUid:
        key = uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX
        try:
            self._rm_object(key)
        except FileNotFoundError as exception:
            raise BlockNotFoundError('Block UID {} not found on storage.'.format(str(uid)), uid) from exception
        finally:
            try:
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
        return uid

    def rm_block_async(self, uid: BlockUid) -> None:

        def job():
            return self._rm_block(uid)

        self._remove_executor.submit(job)

    def rm_block(self, uid: BlockUid) -> None:
        self._rm_block(uid)

    def rm_get_completed(self, timeout: int = None) -> Iterator[Union[BlockUid, BaseException]]:
        return self._remove_executor.get_completed(timeout=timeout)

    def wait_rms_finished(self):
        self._remove_executor.wait_for_all()

    # def rm_many_blocks(self, uids: Union[Sequence[BlockUid], AbstractSet[BlockUid]]) -> List[BlockUid]:
    #     keys = [uid.storage_object_to_path() for uid in uids]
    #     metadata_keys = [key + self._META_SUFFIX for key in keys]
    #
    #     errors = self._rm_many_objects(keys)
    #     self._rm_many_objects(metadata_keys)
    #     return [cast(BlockUid, BlockUid.storage_path_to_object(error)) for error in errors]

    def list_blocks(self) -> Iterable[BlockUid]:
        keys = self._list_objects(BlockUid.storage_prefix())
        for key in keys:
            assert isinstance(key, str)
            if key.endswith(self._META_SUFFIX):
                continue
            try:
                yield cast(BlockUid, BlockUid.storage_path_to_object(key))
            except (RuntimeError, ValueError):
                # Ignore any keys which don't match our pattern to account for stray objects/files
                pass

    def list_versions(self) -> Iterable[VersionUid]:
        keys = self._list_objects(VersionUid.storage_prefix())
        for key in keys:
            assert isinstance(key, str)
            if key.endswith(self._META_SUFFIX):
                continue
            try:
                yield cast(VersionUid, VersionUid.storage_path_to_object(key))
            except (RuntimeError, ValueError):
                # Ignore any keys which don't match our pattern to account for stray objects/files
                pass

    def read_version(self, version_uid: VersionUid) -> str:
        key = version_uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX
        data = self._read_object(key)
        metadata_json = self._read_object(metadata_key)

        metadata = self._decode_metadata(metadata_json=metadata_json, key=key, data_length=len(data))

        if self._TRANSFORMS_KEY in metadata:
            data = self._decapsulate(data, metadata[self._TRANSFORMS_KEY])

        if len(data) != metadata[self._SIZE_KEY]:
            raise ValueError('Length mismatch of original data for object {}. Expected: {}, got: {}.'.format(
                key, metadata[self._SIZE_KEY], len(data)))

        return data.decode('utf-8')

    def write_version(self, version_uid: VersionUid, data: str, overwrite: Optional[bool] = False) -> None:
        key = version_uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX

        if not overwrite:
            try:
                self._read_object(key)
            except FileNotFoundError:
                pass
            else:
                raise FileExistsError('Version {} already exists in storage.'.format(version_uid.v_string))

        data_bytes = data.encode('utf-8')
        size = len(data_bytes)
        data_bytes, transforms_metadata = self._encapsulate(data_bytes)

        metadata, metadata_json = self._build_metadata(
            size=size, object_size=len(data_bytes), transforms_metadata=transforms_metadata)

        try:
            self._write_object(key, data_bytes)
            self._write_object(metadata_key, metadata_json)
        except:
            try:
                self._rm_object(key)
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
            raise

        if self._consistency_check_writes:
            self._check_write(key=key, metadata_key=metadata_key, data_expected=data_bytes)

    def rm_version(self, version_uid: VersionUid) -> None:
        key = version_uid.storage_object_to_path()
        metadata_key = key + self._META_SUFFIX
        try:
            self._rm_object(key)
        finally:
            try:
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass

    def storage_stats(self) -> Tuple[int, int]:
        objects_count = 0
        objects_size = 0
        for key, size in cast(Iterable[Tuple[str, int]], self._list_objects(include_size=True)):
            objects_count += 1
            objects_size += size
        return objects_count, objects_size

    def _encapsulate(self, data: bytes) -> Tuple[bytes, List]:
        if self._active_transforms is not None:
            transforms_metadata = []
            for transform in self._active_transforms:
                data_encapsulated, materials = transform.encapsulate(data=data)
                if data_encapsulated:
                    transforms_metadata.append({
                        'name': transform.name,
                        'module': transform.module,
                        'materials': materials,
                    })
                    data = data_encapsulated
            return data, transforms_metadata
        else:
            return data, []

    def _decapsulate(self, data: bytes, transforms_metadata: Sequence[Dict]) -> bytes:
        for element in reversed(transforms_metadata):
            name = element['name']
            module = element['module']
            transform = TransformFactory.get_by_name(name)
            if transform:
                if module != transform.module:
                    raise ConfigurationError('Mismatch between object transform module and configured module for ' +
                                             '{} ({} != {})'.format(name, module, transform.module))

                data = transform.decapsulate(data=data, materials=element['materials'])
            else:
                raise IOError('Unknown transform {} in object metadata.'.format(name))
        return data

    def wait_writes_finished(self) -> None:
        self._write_executor.wait_for_all()

    def use_read_cache(self, enable: bool) -> bool:
        return False

    def close(self) -> None:
        self._read_executor.shutdown()
        self._write_executor.shutdown()
        self._remove_executor.shutdown()

    @abstractmethod
    def _write_object(self, key: str, data: bytes):
        raise NotImplementedError

    @abstractmethod
    def _read_object(self, key: str) -> bytes:
        raise NotImplementedError

    @abstractmethod
    def _read_object_length(self, key: str) -> int:
        raise NotImplementedError

    @abstractmethod
    def _rm_object(self, key: str) -> None:
        raise NotImplementedError

    @abstractmethod
    def _list_objects(self, prefix: str = None,
                      include_size: bool = False) -> Union[Iterable[str], Iterable[Tuple[str, int]]]:
        raise NotImplementedError
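

# Illustrative sketch (not part of the original module): a minimal in-memory StorageBase
# subclass showing what the abstract object primitives above are expected to do. The class
# name and its dict-backed store are assumptions made for demonstration only; real backends
# persist objects durably and raise FileNotFoundError for missing keys, which the base class
# relies on in _read(), _rm_block() and write_version().
class _InMemoryStorageSketch(StorageBase):

    def __init__(self, *, config: Config, name: str, storage_id: int, module_configuration: ConfigDict) -> None:
        super().__init__(config=config, name=name, storage_id=storage_id, module_configuration=module_configuration)
        self._objects: Dict[str, bytes] = {}

    def _write_object(self, key: str, data: bytes):
        self._objects[key] = data

    def _read_object(self, key: str) -> bytes:
        try:
            return self._objects[key]
        except KeyError:
            raise FileNotFoundError('Object {} not found.'.format(key)) from None

    def _read_object_length(self, key: str) -> int:
        return len(self._read_object(key))

    def _rm_object(self, key: str) -> None:
        try:
            del self._objects[key]
        except KeyError:
            raise FileNotFoundError('Object {} not found.'.format(key)) from None

    def _list_objects(self, prefix: str = None,
                      include_size: bool = False) -> Union[Iterable[str], Iterable[Tuple[str, int]]]:
        # Yields either keys or (key, size) tuples, matching how list_blocks(), list_versions()
        # and storage_stats() above consume the result.
        for key, data in self._objects.items():
            if prefix is not None and not key.startswith(prefix):
                continue
            yield (key, len(data)) if include_size else key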


class DataBackend(metaclass=ABCMeta):

    _COMPRESSION_KEY = 'compression'
    _ENCRYPTION_KEY = 'encryption'
    _SIZE_KEY = 'size'
    _OBJECT_SIZE_KEY = 'object_size'
    _CHECKSUM_KEY = 'checksum'

    PACKAGE_PREFIX = 'benji.data_backends'
    _ENCRYPTION_PACKAGE_PREFIX = PACKAGE_PREFIX + '.encryption'
    _COMPRESSION_PACKAGE_PREFIX = PACKAGE_PREFIX + '.compression'

    # For the benefit of the file and B2 backends these must end in a slash
    _BLOCKS_PREFIX = 'blocks/'
    _VERSIONS_PREFIX = 'versions/'

    _META_SUFFIX = '.meta'

    def __init__(self, config):
        self.encryption = {}
        self.compression = {}
        self.active_encryption = None
        self.active_compression = None

        encryption_modules = config.get('dataBackend.encryption', None, types=list)
        if encryption_modules is not None:
            for encryption_module_dict in encryption_modules:
                type = config.get_from_dict(encryption_module_dict, 'type', types=str)
                identifier = config.get_from_dict(encryption_module_dict, 'identifier', types=str)
                materials = config.get_from_dict(encryption_module_dict, 'materials', types=dict)
                try:
                    encryption_module = importlib.import_module('{}.{}'.format(self._ENCRYPTION_PACKAGE_PREFIX, type))
                except ImportError:
                    raise ConfigurationError('Module file {}.{} not found or related import error.'.format(
                        self._ENCRYPTION_PACKAGE_PREFIX, type))
                else:
                    if type != encryption_module.Encryption.NAME:
                        raise InternalError('Encryption module type and name don\'t agree ({} != {}).'.format(
                            type, encryption_module.Encryption.NAME))
                    self.encryption[identifier] = encryption_module.Encryption(
                        identifier=identifier, materials=materials)

        active_encryption = config.get('dataBackend.{}.activeEncryption'.format(self.NAME), None, types=str)
        if active_encryption is not None:
            if self.encryption and active_encryption in self.encryption:
                logger.info('Encryption is enabled for the {} data backend.'.format(self.NAME))
                self.active_encryption = self.encryption[active_encryption]
            else:
                raise ConfigurationError('Encryption identifier {} is unknown.'.format(active_encryption))

        compression_modules = config.get('dataBackend.compression', None, types=list)
        if compression_modules is not None:
            for compression_module_dict in compression_modules:
                type = config.get_from_dict(compression_module_dict, 'type', types=str)
                materials = config.get_from_dict(compression_module_dict, 'materials', None, types=dict)
                try:
                    compression_module = importlib.import_module('{}.{}'.format(self._COMPRESSION_PACKAGE_PREFIX, type))
                except ImportError:
                    raise ConfigurationError('Module file {}.{} not found or related import error.'.format(
                        self._COMPRESSION_PACKAGE_PREFIX, type))
                else:
                    if type != compression_module.Compression.NAME:
                        raise InternalError('Compression module type and name don\'t agree ({} != {}).'.format(
                            type, compression_module.Compression.NAME))
                    self.compression[type] = compression_module.Compression(materials=materials)

        active_compression = config.get('dataBackend.{}.activeCompression'.format(self.NAME), None, types=str)
        if active_compression is not None:
            if self.compression and active_compression in self.compression:
                logger.info('Compression is enabled for the {} data backend.'.format(self.NAME))
                self.active_compression = self.compression[active_compression]
            else:
                raise ConfigurationError('Compression type {} is unknown.'.format(active_compression))

        simultaneous_writes = config.get('dataBackend.simultaneousWrites', types=int)
        simultaneous_reads = config.get('dataBackend.simultaneousReads', types=int)
        bandwidth_read = config.get('dataBackend.bandwidthRead', types=int)
        bandwidth_write = config.get('dataBackend.bandwidthWrite', types=int)

        self._consistency_check_writes = config.get('dataBackend.consistencyCheckWrites', False, types=bool)

        self._compression_statistics = {
            'objects_considered': 0,
            'objects_compressed': 0,
            'data_in': 0,
            'data_out': 0,
            'data_in_compression': 0,
            'data_out_compression': 0
        }

        self.read_throttling = TokenBucket()
        self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
        self.write_throttling = TokenBucket()
        self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

        self._read_executor = ThreadPoolExecutor(
            max_workers=simultaneous_reads, thread_name_prefix='DataBackend-Reader')
        self._read_futures = []
        self._read_semaphore = BoundedSemaphore(simultaneous_reads + self.READ_QUEUE_LENGTH)

        self._write_executor = ThreadPoolExecutor(
            max_workers=simultaneous_writes, thread_name_prefix='DataBackend-Writer')
        self._write_futures = []
        self._write_semaphore = BoundedSemaphore(simultaneous_writes + self.WRITE_QUEUE_LENGTH)

    def _check_write(self, key, metadata_key, data, metadata):
        # Source: https://stackoverflow.com/questions/4527942/comparing-two-dictionaries-in-python
        def dict_compare(d1, d2):
            d1_keys = set(d1.keys())
            d2_keys = set(d2.keys())
            intersect_keys = d1_keys.intersection(d2_keys)
            added = d1_keys - d2_keys
            removed = d2_keys - d1_keys
            modified = {o: (d1[o], d2[o]) for o in intersect_keys if d1[o] != d2[o]}
            same = set(o for o in intersect_keys if d1[o] == d2[o])
            return added, removed, modified, same

        rdata = self._read_object(key)
        rmetadata = self._read_object(metadata_key)
        rmetadata = json.loads(rmetadata.decode('utf-8'))

        if metadata:
            added, removed, modified, same = dict_compare(rmetadata, metadata)
            logger.debug('Comparing written and read metadata of {}:'.format(key))
            logger.debug(' added: {}, removed: {}, modified: {}, same: {}'.format(added, removed, modified, same))
            if removed:
                raise InternalError('Consistency check: Metadata headers are missing in read back data: {}'.format(
                    ', '.join(removed)))
            different_for = []
            for name in modified:
                logger.debug('Metadata differences: ')
                logger.debug(' {}: wrote {}, read {}'.format(name, metadata[name], rmetadata[name]))
                if metadata[name] != rmetadata[name]:
                    different_for.append(name)
            if different_for:
                raise InternalError('Consistency check: Written and read metadata of {} are different for {}.'.format(
                    key, ', '.join(different_for)))

        # Comparing encrypted/compressed data here
        if data != rdata:
            raise InternalError('Consistency check: Written and read data of {} differ.'.format(key))

    def _write(self, block, data):
        data, metadata = self._compress(data)
        data, metadata_2 = self._encrypt(data)
        metadata.update(metadata_2)

        metadata[self._SIZE_KEY] = block.size
        metadata[self._OBJECT_SIZE_KEY] = len(data)
        metadata[self._CHECKSUM_KEY] = block.checksum
        metadata_json = json.dumps(metadata, separators=(',', ':')).encode('utf-8')
        logger.debug('Metadata of block {}: {}'.format(block.uid, metadata))

        key = self._block_uid_to_key(block.uid)
        metadata_key = key + self._META_SUFFIX

        time.sleep(self.write_throttling.consume(len(data) + len(metadata_json)))
        t1 = time.time()
        try:
            self._write_object(key, data)
            self._write_object(metadata_key, metadata_json)
        except:
            try:
                self._rm_object(key)
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
            raise
        t2 = time.time()

        logger.debug('{} wrote data of uid {} in {:.2f}s'.format(threading.current_thread().name, block.uid, t2 - t1))

        if self._consistency_check_writes:
            self._check_write(key, metadata_key, data, metadata)

        return block

    def save(self, block, data, sync=False):
        if sync:
            self._write(block, data)
        else:
            self._write_semaphore.acquire()

            def write_with_release():
                try:
                    return self._write(block, data)
                except Exception:
                    raise
                finally:
                    self._write_semaphore.release()

            self._write_futures.append(self._write_executor.submit(write_with_release))

    def save_get_completed(self, timeout=None):
        """ Returns a generator for all completed write jobs """
        return future_results_as_completed(self._write_futures, timeout=timeout)

    def _read(self, block, metadata_only):
        key = self._block_uid_to_key(block.uid)
        metadata_key = key + self._META_SUFFIX

        t1 = time.time()
        if not metadata_only:
            data = self._read_object(key)
            data_length = len(data)
        else:
            data = None
            data_length = self._read_object_length(key)
        metadata = self._read_object(metadata_key)
        time.sleep(self.read_throttling.consume((len(data) if data else 0) + len(metadata)))
        t2 = time.time()

        metadata = json.loads(metadata.decode('utf-8'))

        if self._OBJECT_SIZE_KEY not in metadata:
            raise KeyError('Required metadata key {} is missing for block {} (UID {}).'.format(
                self._OBJECT_SIZE_KEY, block.id, block.uid))

        if data_length != metadata[self._OBJECT_SIZE_KEY]:
            raise ValueError('Mismatch between recorded object size and actual object size for block {} (UID {}). '
                             'Expected: {}, got: {}.'.format(block.id, block.uid, metadata[self._OBJECT_SIZE_KEY],
                                                             data_length))

        if not metadata_only:
            data = self._decrypt(data, metadata)
            data = self._uncompress(data, metadata)

        logger.debug('{} read data of uid {} in {:.2f}s{}'.format(threading.current_thread().name, block.uid, t2 - t1,
                                                                  ' (metadata only)' if metadata_only else ''))

        return block, data, metadata

    def read(self, block, sync=False, metadata_only=False):
        if sync:
            return self._read(block, metadata_only)[1]
        else:

            def read_with_acquire():
                self._read_semaphore.acquire()
                return self._read(block, metadata_only)

            self._read_futures.append(self._read_executor.submit(read_with_acquire))

    def read_get_completed(self, timeout=None):
        """ Returns a generator for all completed read jobs """
        return future_results_as_completed(self._read_futures, semaphore=self._read_semaphore, timeout=timeout)

    def check_block_metadata(self, *, block, data_length, metadata):
        for required_key in [self._SIZE_KEY, self._CHECKSUM_KEY]:
            if required_key not in metadata:
                raise KeyError('Required metadata key {} is missing for block {} (UID {}).'.format(
                    required_key, block.id, block.uid))

        if metadata[self._SIZE_KEY] != block.size:
            raise ValueError('Mismatch between recorded block size and data length in metadata for block {} (UID {}). '
                             'Expected: {}, got: {}.'.format(block.id, block.uid, block.size, metadata[self._SIZE_KEY]))

        if data_length and data_length != block.size:
            raise ValueError('Mismatch between recorded block size and actual data length for block {} (UID {}). '
                             'Expected: {}, got: {}.'.format(block.id, block.uid, block.size, data_length))

        if block.checksum != metadata[self._CHECKSUM_KEY]:
            raise ValueError('Mismatch between recorded block checksum and checksum in metadata for block {} (UID {}). '
                             'Expected: {}, got: {}.'.format(block.id, block.uid, block.checksum[:16],
                                                             metadata[self._CHECKSUM_KEY][:16]))

    def rm(self, uid):
        key = self._block_uid_to_key(uid)
        metadata_key = key + self._META_SUFFIX
        try:
            self._rm_object(key)
        finally:
            try:
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass

    def rm_many(self, uids):
        keys = [self._block_uid_to_key(uid) for uid in uids]
        metadata_keys = [key + self._META_SUFFIX for key in keys]

        errors = self._rm_many_objects(keys)
        self._rm_many_objects(metadata_keys)
        return [self._key_to_block_uid(error) for error in errors]

    def list_blocks(self):
        keys = self._list_objects(self._BLOCKS_PREFIX)
        block_uids = []
        for key in keys:
            if key.endswith(self._META_SUFFIX):
                continue
            try:
                block_uids.append(self._key_to_block_uid(key))
            except (RuntimeError, ValueError):
                # Ignore any keys which don't match our pattern to account for stray objects/files
                pass
        return block_uids

    def list_versions(self):
        keys = self._list_objects(self._VERSIONS_PREFIX)
        version_uids = []
        for key in keys:
            if key.endswith(self._META_SUFFIX):
                continue
            try:
                version_uids.append(self._key_to_version_uid(key))
            except (RuntimeError, ValueError):
                # Ignore any keys which don't match our pattern to account for stray objects/files
                pass
        return version_uids

    def read_version(self, version_uid):
        key = self._version_uid_to_key(version_uid)
        metadata_key = key + self._META_SUFFIX
        data = self._read_object(key)
        metadata = self._read_object(metadata_key)
        metadata = json.loads(metadata.decode('utf-8'))

        for required_key in [self._OBJECT_SIZE_KEY, self._SIZE_KEY]:
            if required_key not in metadata:
                raise KeyError('Required metadata key {} is missing for object {}.'.format(required_key, key))

        if len(data) != metadata[self._OBJECT_SIZE_KEY]:
            raise ValueError('Length mismatch for object {}. Expected: {}, got: {}.'.format(
                key, metadata[self._OBJECT_SIZE_KEY], len(data)))

        data = self._decrypt(data, metadata)
        data = self._uncompress(data, metadata)

        if len(data) != metadata[self._SIZE_KEY]:
            raise ValueError('Length mismatch of original data for object {}. Expected: {}, got: {}.'.format(
                key, metadata[self._SIZE_KEY], len(data)))

        data = data.decode('utf-8')
        return data

    def save_version(self, version_uid, data, overwrite=False):
        key = self._version_uid_to_key(version_uid)
        metadata_key = key + self._META_SUFFIX

        if not overwrite:
            try:
                self._read_object(key)
            except FileNotFoundError:
                pass
            else:
                raise FileExistsError('Version {} already exists in data backend.'.format(version_uid.readable))

        data = data.encode('utf-8')
        size = len(data)
        data, metadata = self._compress(data)
        data, metadata_2 = self._encrypt(data)
        metadata.update(metadata_2)

        metadata[self._SIZE_KEY] = size
        metadata[self._OBJECT_SIZE_KEY] = len(data)
        metadata_json = json.dumps(metadata, separators=(',', ':')).encode('utf-8')

        try:
            self._write_object(key, data)
            self._write_object(metadata_key, metadata_json)
        except:
            try:
                self._rm_object(key)
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass
            raise

        if self._consistency_check_writes:
            self._check_write(key, metadata_key, data, metadata)

    def rm_version(self, version_uid):
        key = self._version_uid_to_key(version_uid)
        metadata_key = key + self._META_SUFFIX
        try:
            self._rm_object(key)
        finally:
            try:
                self._rm_object(metadata_key)
            except FileNotFoundError:
                pass

    def _encrypt(self, data):
        if self.active_encryption is not None:
            data, materials = self.active_encryption.encrypt(data=data)
            metadata = {
                self._ENCRYPTION_KEY: {
                    'identifier': self.active_encryption.identifier,
                    'type': self.active_encryption.NAME,
                    'materials': materials
                }
            }
            return data, metadata
        else:
            return data, {}

    def _decrypt(self, data, metadata):
        if self._ENCRYPTION_KEY in metadata:
            identifier = metadata[self._ENCRYPTION_KEY]['identifier']
            type = metadata[self._ENCRYPTION_KEY]['type']
            if identifier in self.encryption:
                encryption = self.encryption[identifier]
                if type != encryption.NAME:
                    raise ConfigurationError('Mismatch between object encryption type and configured type for identifier ' +
                                             '{} ({} != {})'.format(identifier, type, encryption.NAME))

                return encryption.decrypt(data=data, materials=metadata[self._ENCRYPTION_KEY]['materials'])
            else:
                raise IOError('Unknown encryption identifier {} in object metadata.'.format(identifier))
        else:
            return data

    def _compress(self, data):
        self._compression_statistics['objects_considered'] += 1
        self._compression_statistics['data_in'] += len(data)

        if self.active_compression is not None:
            compressed_data, materials = self.active_compression.compress(data=data)
            if len(compressed_data) < len(data):
                self._compression_statistics['objects_compressed'] += 1
                self._compression_statistics['data_in_compression'] += len(data)
                self._compression_statistics['data_out_compression'] += len(compressed_data)
                self._compression_statistics['data_out'] += len(compressed_data)

                metadata = {
                    self._COMPRESSION_KEY: {
                        'type': self.active_compression.NAME,
                        'materials': materials
                    }
                }
                return compressed_data, metadata
            else:
                self._compression_statistics['data_out'] += len(data)
                return data, {}
        else:
            self._compression_statistics['data_out'] += len(data)
            return data, {}

    def _uncompress(self, data, metadata):
        if self._COMPRESSION_KEY in metadata:
            type = metadata[self._COMPRESSION_KEY]['type']
            if type in self.compression:
                return self.compression[type].uncompress(
                    data=data,
                    materials=metadata[self._COMPRESSION_KEY]['materials'],
                    original_size=metadata[self._SIZE_KEY])
            else:
                raise IOError('Unsupported compression type {} in object metadata.'.format(type))
        else:
            return data

    def wait_reads_finished(self):
        concurrent.futures.wait(self._read_futures)

    def wait_saves_finished(self):
        concurrent.futures.wait(self._write_futures)

    def use_read_cache(self, enable):
        return False

    def _log_compression_statistics(self):
        if self.active_compression is None or self._compression_statistics['objects_considered'] == 0:
            return

        overall_ratio, ratio = 0.0, 0.0
        if self._compression_statistics['data_out'] > 0:
            overall_ratio = self._compression_statistics['data_in'] / self._compression_statistics['data_out']

        if self._compression_statistics['data_out_compression'] > 0:
            ratio = self._compression_statistics['data_in_compression'] \
                / self._compression_statistics['data_out_compression']

        tbl = PrettyTable()
        tbl.field_names = [
            'Objects considered', 'Objects compressed', 'Data in', 'Data out', 'Overall compression ratio',
            'Data input to compression', 'Data output from compression', 'Compression ratio'
        ]
        tbl.align['Objects considered'] = 'r'
        tbl.align['Objects compressed'] = 'r'
        tbl.align['Data in'] = 'r'
        tbl.align['Data out'] = 'r'
        tbl.align['Overall compression ratio'] = 'r'
        tbl.align['Data input to compression'] = 'r'
        tbl.align['Data output from compression'] = 'r'
        tbl.align['Compression ratio'] = 'r'
        tbl.add_row([
            self._compression_statistics['objects_considered'], self._compression_statistics['objects_compressed'],
            self._compression_statistics['data_in'], self._compression_statistics['data_out'],
            '{:.2f}'.format(overall_ratio), self._compression_statistics['data_in_compression'],
            self._compression_statistics['data_out_compression'], '{:.2f}'.format(ratio)
        ])
        logger.info('Compression statistics: \n' + textwrap.indent(str(tbl), ' '))

    def close(self):
        self._log_compression_statistics()

        if len(self._read_futures) > 0:
            logger.warning('Data backend closed with {} outstanding read jobs, cancelling them.'.format(
                len(self._read_futures)))
            for future in self._read_futures:
                future.cancel()
            logger.debug('Data backend cancelled all outstanding read jobs.')
            # Get all jobs so that the semaphore gets released and still waiting jobs can complete
            for future in self.read_get_completed():
                pass
            logger.debug('Data backend read results from all outstanding read jobs.')

        if len(self._write_futures) > 0:
            logger.warning('Data backend closed with {} outstanding write jobs, cancelling them.'.format(
                len(self._write_futures)))
            for future in self._write_futures:
                future.cancel()
            logger.debug('Data backend cancelled all outstanding write jobs.')
            # Write jobs release their semaphore at completion so we don't need to collect the results
            self._write_futures = []

        self._write_executor.shutdown()
        self._read_executor.shutdown()

    def _block_uid_to_key(self, block_uid):
        key_name = '{:016x}-{:016x}'.format(block_uid.left, block_uid.right)
        digest = hashlib.md5(key_name.encode('ascii')).hexdigest()
        return '{}{}/{}/{}-{}'.format(self._BLOCKS_PREFIX, digest[0:2], digest[2:4], digest[:8], key_name)

    def _key_to_block_uid(self, key):
        bpl = len(self._BLOCKS_PREFIX)
        if len(key) != 48 + bpl:
            raise RuntimeError('Invalid key name {}'.format(key))
        return BlockUid(int(key[15 + bpl:15 + bpl + 16], 16), int(key[32 + bpl:32 + bpl + 16], 16))

    def _version_uid_to_key(self, version_uid):
        return '{}{}/{}/{}'.format(self._VERSIONS_PREFIX, version_uid.readable[-1:], version_uid.readable[-2:-1],
                                   version_uid.readable)

    def _key_to_version_uid(self, key):
        vpl = len(self._VERSIONS_PREFIX)
        vl = len(VersionUid(1).readable)
        if len(key) != vpl + vl + 4:
            raise RuntimeError('Invalid key name {}'.format(key))
        return VersionUid.create_from_readables(key[vpl + 4:vpl + vl + 4])

    @abstractmethod
    def _write_object(self, key, data):
        raise NotImplementedError

    @abstractmethod
    def _read_object(self, key):
        raise NotImplementedError

    @abstractmethod
    def _read_object_length(self, key):
        raise NotImplementedError

    @abstractmethod
    def _rm_object(self, key):
        raise NotImplementedError

    @abstractmethod
    def _rm_many_objects(self, keys):
        raise NotImplementedError

    @abstractmethod
    def _list_objects(self, prefix):
        raise NotImplementedError
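

# Illustrative sketch (not part of the original module): a minimal in-memory DataBackend
# subclass. NAME, READ_QUEUE_LENGTH and WRITE_QUEUE_LENGTH are referenced by __init__ above,
# so a concrete backend is expected to provide them; the values and the dict-backed store
# below are assumptions made for demonstration only. Real backends (file, S3, B2, ...) wire
# these primitives to durable object storage.
class _InMemoryDataBackendSketch(DataBackend):

    NAME = 'inmemory-sketch'  # hypothetical backend name
    READ_QUEUE_LENGTH = 5
    WRITE_QUEUE_LENGTH = 5

    def __init__(self, config):
        super().__init__(config)
        self._objects = {}

    def _write_object(self, key, data):
        self._objects[key] = data

    def _read_object(self, key):
        try:
            return self._objects[key]
        except KeyError:
            raise FileNotFoundError('Object {} not found.'.format(key)) from None

    def _read_object_length(self, key):
        return len(self._read_object(key))

    def _rm_object(self, key):
        try:
            del self._objects[key]
        except KeyError:
            raise FileNotFoundError('Object {} not found.'.format(key)) from None

    def _rm_many_objects(self, keys):
        # Returns the keys that could not be removed, matching how rm_many() above maps
        # the returned errors back to block UIDs.
        errors = []
        for key in keys:
            try:
                del self._objects[key]
            except KeyError:
                errors.append(key)
        return errors

    def _list_objects(self, prefix):
        return [key for key in self._objects if key.startswith(prefix)]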