def _read(self, block: DereferencedBlock) -> Tuple[DereferencedBlock, bytes]:
    offset = block.idx * self.block_size
    t1 = time.time()
    ioctx = self._cluster.open_ioctx(self._pool_name)
    with rbd.Image(ioctx, self._image_name, self._snapshot_name, read_only=True) as image:
        # LIBRADOS_OP_FLAG_FADVISE_DONTNEED: Indicates read data will not be accessed in the near future (by anyone)
        # LIBRADOS_OP_FLAG_FADVISE_NOCACHE: Indicates read data will not be accessed again (by *this* client)
        data = image.read(offset, block.size, rados.LIBRADOS_OP_FLAG_FADVISE_NOCACHE)
    t2 = time.time()

    if not data:
        raise EOFError('End of file reached on {} when there should be data.'.format(self.url))

    logger.debug('{} read block {} in {:.3f}s'.format(
        threading.current_thread().name,
        block.idx,
        t2 - t1,
    ))

    return block, data
def _read(self, block: DereferencedBlock) -> Tuple[DereferencedBlock, bytes]:
    assert block.size == self.block_size
    lba = (block.id * self.block_size) // self._iscsi_block_size
    num_blocks = self.block_size // self._iscsi_block_size

    if lba >= self._iscsi_num_blocks:
        raise RuntimeError('Attempt to read outside of the device. Requested LBA is {}, but device has only {} blocks. (1)'.format(
            lba, self._iscsi_num_blocks))
    if lba + num_blocks > self._iscsi_num_blocks:
        raise RuntimeError('Attempt to read outside of the device. Requested LBA is {}, but device has only {} blocks. (2)'.format(
            lba + num_blocks, self._iscsi_num_blocks))

    t1 = time.time()
    task = self._iscsi_execute_sync('READ(16)', libiscsi.iscsi_read16_sync, self._iscsi_context, self._iscsi_lun,
                                    lba, self.block_size, self._iscsi_block_size, 0, 0, 0, 0, 0)
    data = task.datain
    assert len(data) == self.block_size
    t2 = time.time()

    logger.debug('{} read block {} in {:.3f}s'.format(
        threading.current_thread().name,
        block.id,
        t2 - t1,
    ))

    return block, data
def _write(self, block: DereferencedBlock, data: bytes) -> DereferencedBlock:
    assert block.size == self.block_size
    lba = (block.id * self.block_size) // self._iscsi_block_size
    num_blocks = self.block_size // self._iscsi_block_size

    if lba >= self._iscsi_num_blocks:
        raise RuntimeError('Attempt to write outside of the device. Requested LBA is {}, but device has only {} blocks. (1)'.format(
            lba, self._iscsi_num_blocks))
    if lba + num_blocks > self._iscsi_num_blocks:
        raise RuntimeError('Attempt to write outside of the device. Requested LBA is {}, but device has only {} blocks. (2)'.format(
            lba + num_blocks, self._iscsi_num_blocks))

    t1 = time.time()
    self._iscsi_execute_sync('WRITE(16)', libiscsi.iscsi_write16_sync, self._iscsi_context, self._iscsi_lun, lba,
                             data, self._iscsi_block_size, 0, 0, 0, 0, 0)
    t2 = time.time()

    logger.debug('{} wrote block {} in {:.3f}s'.format(
        threading.current_thread().name,
        block.id,
        t2 - t1,
    ))

    return block
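A hedged, standalone sketch of the block-to-LBA arithmetic shared by the two iSCSI methods above; the 4 MiB backup block size, 512-byte logical block size and device size are made-up example values, not taken from the source.

# Example: converting a backup block index into an iSCSI LBA and transfer length.
block_size = 4 * 1024 * 1024          # hypothetical backup block size
iscsi_block_size = 512                # hypothetical device logical block size
iscsi_num_blocks = 20 * 1024 * 1024   # hypothetical device size in logical blocks (10 GiB)
block_id = 3

lba = (block_id * block_size) // iscsi_block_size   # 24576
num_blocks = block_size // iscsi_block_size         # 8192
assert lba + num_blocks <= iscsi_num_blocks         # same bounds check as above
print(lba, num_blocks)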
def open_r(self) -> None:
    re_match = re.match('^([^/]+)/(?:([^/]*)/)?([^@]+)(?:@(.+))?$', self.parsed_url.path)
    if not re_match:
        raise UsageError('URL {} is invalid. Need {}:<pool>[/<namespace>]/<imagename>[@<snapshotname>].'.format(
            self.url, self.name))
    self._pool_name, self._namespace_name, self._image_name, self._snapshot_name = re_match.groups()

    # Try opening it and quit if that's not possible.
    try:
        ioctx = self._cluster.open_ioctx(self._pool_name)
        if self._namespace_name is not None and len(self._namespace_name) > 0:
            logger.debug(f'Configuring io context to use namespace {self._namespace_name}.')
            ioctx.set_namespace(self._namespace_name)
    except rados.ObjectNotFound:
        raise FileNotFoundError('Ceph pool {} not found.'.format(self._pool_name)) from None

    try:
        self._rbd_image = rbd.Image(ioctx, self._image_name, self._snapshot_name, read_only=True)
    except rbd.ImageNotFound:
        raise FileNotFoundError('RBD image or snapshot {} not found.'.format(self.url)) from None
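A hedged, standalone illustration of what the URL path regular expression in open_r() extracts; 'pool', 'ns', 'image' and 'snap' are made-up example names.

import re

pattern = re.compile(r'^([^/]+)/(?:([^/]*)/)?([^@]+)(?:@(.+))?$')
# Groups are (pool, namespace, image, snapshot); namespace and snapshot are optional.
print(pattern.match('pool/ns/image@snap').groups())  # ('pool', 'ns', 'image', 'snap')
print(pattern.match('pool/image@snap').groups())     # ('pool', None, 'image', 'snap')
print(pattern.match('pool/image').groups())          # ('pool', None, 'image', None)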
def _read(self, block: DereferencedBlock) -> Tuple[DereferencedBlock, bytes]:
    offset = block.id * self._block_size
    t1 = time.time()
    ioctx = self._cluster.open_ioctx(self._pool_name)
    with rbd.Image(ioctx, self._image_name, self._snapshot_name, read_only=True) as image:
        data = image.read(offset, block.size, rados.LIBRADOS_OP_FLAG_FADVISE_DONTNEED)
    t2 = time.time()

    if not data:
        raise EOFError('End of file reached on {} when there should be data.'.format(self.url))

    logger.debug('{} read block {} in {:.2f}s'.format(
        threading.current_thread().name,
        block.id,
        t2 - t1,
    ))

    return block, data
def _write(self, block: DereferencedBlock, data: bytes) -> DereferencedBlock:
    data, transforms_metadata = self._encapsulate(data)

    metadata, metadata_json = self._build_metadata(size=block.size,
                                                   object_size=len(data),
                                                   checksum=block.checksum,
                                                   transforms_metadata=transforms_metadata)

    key = block.uid.storage_object_to_path()
    metadata_key = key + self._META_SUFFIX

    time.sleep(self.write_throttling.consume(len(data) + len(metadata_json)))
    t1 = time.time()
    try:
        self._write_object(key, data)
        self._write_object(metadata_key, metadata_json)
    except:
        try:
            self._rm_object(key)
            self._rm_object(metadata_key)
        except FileNotFoundError:
            pass
        raise
    t2 = time.time()

    logger.debug('{} wrote data of uid {} in {:.2f}s'.format(threading.current_thread().name, block.uid, t2 - t1))

    if self._consistency_check_writes:
        try:
            self._check_write(key=key, metadata_key=metadata_key, data_expected=data)
        except (KeyError, ValueError) as exception:
            raise InvalidBlockException('Check write of block {} (UID {}) failed.'.format(block.id, block.uid),
                                        block) from exception

    return block
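The time.sleep(self.write_throttling.consume(...)) call above paces writes by sleeping for however long the throttle says the payload "costs". Below is a minimal, standalone sketch of that contract, assuming consume() returns the number of seconds the caller should sleep; it is an illustration only, not the project's actual throttling class.

import time

class SimpleThrottle:
    # Leaky-bucket style pacing: consume() returns how long the caller should sleep
    # so that the long-run throughput stays at or below the configured rate.

    def __init__(self, bytes_per_second: float) -> None:
        self.rate = bytes_per_second
        self.next_free = time.monotonic()

    def consume(self, nbytes: int) -> float:
        now = time.monotonic()
        wait = max(0.0, self.next_free - now)
        # Reserve the time slot this payload occupies at the configured rate.
        self.next_free = max(self.next_free, now) + nbytes / self.rate
        return wait

throttle = SimpleThrottle(bytes_per_second=100 * 1024 * 1024)
time.sleep(throttle.consume(4 * 1024 * 1024))  # same usage pattern as above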
def __init__(self, rules_spec, reference_time=None):
    self.reference_time = time.time() if reference_time is None else reference_time
    self.rules = self._parse_rules(rules_spec)
    logger.debug('Retention filter set up with reference time {} and rules {}'.format(
        self.reference_time, self.rules))
def __init__(self, *, config: Config, name: str, storage_id: int, module_configuration: ConfigDict) -> None:
    read_cache_directory = Config.get_from_dict(module_configuration, 'readCache.directory', None, types=str)
    read_cache_maximum_size = Config.get_from_dict(module_configuration, 'readCache.maximumSize', None, types=int)
    read_cache_shards = Config.get_from_dict(module_configuration, 'readCache.shards', None, types=int)

    if read_cache_directory and read_cache_maximum_size:
        os.makedirs(read_cache_directory, exist_ok=True)
        try:
            self._read_cache = FanoutCache(
                read_cache_directory,
                size_limit=read_cache_maximum_size,
                shards=read_cache_shards,
                eviction_policy='least-frequently-used',
                statistics=1,
            )
        except Exception:
            logger.warning('Unable to enable disk based read caching. Continuing without it.')
            self._read_cache = None
        else:
            logger.debug('Disk based read caching instantiated (cache size {}, shards {}).'.format(
                read_cache_maximum_size, read_cache_shards))
    else:
        self._read_cache = None
    self._use_read_cache = True

    # Start reader and writer threads after the disk cache is created, so that they see it.
    super().__init__(config=config, name=name, storage_id=storage_id, module_configuration=module_configuration)
def test_storage_stats(self):
    NUM_BLOBS = 15
    BLOB_SIZE = 4096

    saved_uids = list(self.storage.list_blocks())
    self.assertEqual(0, len(saved_uids))

    blocks = [
        Block(uid=BlockUid(i + 1, i + 100), size=BLOB_SIZE, checksum='0000000000000000') for i in range(NUM_BLOBS)
    ]
    for block in blocks:
        data = self.random_bytes(BLOB_SIZE)
        self.assertEqual(BLOB_SIZE, len(data))
        self.storage.write_block(block, data)

    objects_count, objects_size = self.storage.storage_stats()
    logger.debug(f'Storage stats: {objects_count} objects using {objects_size} bytes.')

    self.assertEqual(NUM_BLOBS * 2, objects_count)  # Also counts the metadata objects
    self.assertGreater(objects_size, 0)

    for block in blocks:
        self.storage.rm_block(block.uid)
def __init__(self, hash_function_config: str) -> None:
    hash_args: Optional[str] = None
    try:
        hash_name, hash_args = hash_function_config.split(',', 1)
    except ValueError:
        hash_name = hash_function_config

    try:
        hash_module: Any = import_module('{}.{}'.format(self._CRYPTO_PACKAGE, hash_name))
    except ImportError as exception:
        raise ConfigurationError('Unsupported block hash {}.'.format(hash_name)) from exception

    hash_kwargs: Dict[str, Any] = {}
    if hash_args is not None:
        hash_kwargs = {k: literal_eval(v) for k, v in (pair.split('=') for pair in hash_args.split(','))}

    try:
        hash = hash_module.new(**hash_kwargs)
    except (TypeError, ValueError) as exception:
        raise ConfigurationError('Unsupported or invalid block hash arguments: {}.'.format(hash_kwargs)) from exception

    from benji.database import Block
    if len(hash.digest()) > Block.MAXIMUM_CHECKSUM_LENGTH:
        raise ConfigurationError('Specified block hash {} exceeds maximum digest length of {} bytes.'.format(
            hash_name, Block.MAXIMUM_CHECKSUM_LENGTH))

    logger.debug('Using block hash {} with kwargs {}.'.format(hash_name, hash_kwargs))

    self._hash_module = hash_module
    self._hash_kwargs = hash_kwargs
def close(self) -> None:
    super().close()

    if self._read_cache is not None:
        (cache_hits, cache_misses) = self._read_cache.stats()
        logger.debug('Disk based cache statistics (since cache creation): {} hits, {} misses.'.format(
            cache_hits, cache_misses))
        self._read_cache.close()
def test_version_filter_issue_9_slowness(self):
    version_uids = set()
    for i in range(3):
        version = self.database_backend.create_version(version_name='backup-name',
                                                       snapshot_name='snapshot-name.{}'.format(i),
                                                       size=16 * 1024 * 4096,
                                                       storage_id=1,
                                                       block_size=4 * 1024 * 4096)
        self.assertNotIn(version.uid, version_uids)
        version_uids.add(version.uid)

    t1 = timeit.timeit(lambda: self.database_backend.get_versions_with_filter(
        'snapshot_name == "snapshot-name.2" and name == "backup-name"'),
                       number=1)
    t2 = timeit.timeit(lambda: self.database_backend.get_versions_with_filter(
        '(snapshot_name == "snapshot-name.2" and name == "backup-name")'),
                       number=1)
    logger.debug('test_version_filter_issue_9_slowness: t1 {}, t2 {}'.format(t1, t2))
    self.assertLess(t1 - t2, 5)
def parametrized_hash_function(config_hash_function):
    hash_name = None
    hash_args = None
    try:
        hash_name, hash_args = config_hash_function.split(',', 1)
    except ValueError:
        hash_name = config_hash_function
    # Pass a default so that an unknown hash name yields None instead of raising AttributeError.
    hash_function = getattr(hashlib, hash_name, None)
    if hash_function is None:
        raise ConfigurationError('Unsupported hash function {}.'.format(hash_name))
    kwargs = {}
    if hash_args is not None:
        kwargs = dict((k, literal_eval(v)) for k, v in (pair.split('=') for pair in hash_args.split(',')))
    logger.debug('Using hash function {} with kwargs {}'.format(hash_name, kwargs))
    hash_function_w_kwargs = hash_function(**kwargs)

    from benji.metadata import Block
    if len(hash_function_w_kwargs.digest()) > Block.MAXIMUM_CHECKSUM_LENGTH:
        raise ConfigurationError('Specified hash function exceeds maximum digest length of {}.'.format(
            Block.MAXIMUM_CHECKSUM_LENGTH))

    return hash_function_w_kwargs
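A hedged, standalone example of the spec format this function parses: the part before the first comma names a hashlib constructor, the remainder is a comma-separated list of key=value arguments evaluated with literal_eval. The 'blake2b,digest_size=32' spec below is an illustrative value, not taken from the source.

import hashlib
from ast import literal_eval

spec = 'blake2b,digest_size=32'
name, _, args = spec.partition(',')
kwargs = {k: literal_eval(v) for k, v in (pair.split('=') for pair in args.split(','))} if args else {}

h = getattr(hashlib, name)(**kwargs)
h.update(b'example block data')
print(len(h.digest()))  # 32, i.e. a 32-byte digest as requested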
def write_get_completed(self, timeout: Optional[int] = None) -> Iterator[Union[DereferencedBlock, BaseException]]:
    try:
        while not self._writes_finished():
            logger.debug('Write queue length, outstanding writes, completion queue length: {}, {}, {}.'.format(
                len(self._write_queue), self._outstanding_aio_writes, self._write_completion_queue.qsize()))
            self._submit_aio_writes()
            completion, t1, t2, block = self._write_completion_queue.get(block=(timeout is None or timeout != 0),
                                                                         timeout=timeout)
            assert self._outstanding_aio_writes > 0
            self._outstanding_aio_writes -= 1
            try:
                completion.wait_for_complete_and_cb()
            except Exception as exception:
                yield exception
            else:
                write_return_value = completion.get_return_value()
                if write_return_value != 0:
                    raise IOError('Write of block {} failed.'.format(block.idx))
                logger.debug('Wrote block {} in {:.3f}s'.format(block.idx, t2 - t1))
                yield block
            self._write_completion_queue.task_done()
    except queue.Empty:
        return
    else:
        return
def open_w(self, size: int, force: bool = False, sparse: bool = False) -> None:
    self._write_executor = JobExecutor(name='IO-Write', workers=self._simultaneous_writes, blocking_submit=True)

    re_match = re.match('^([^/]+)/([^@]+)$', self.parsed_url.path)
    if not re_match:
        raise UsageError('URL {} is invalid. Need {}:<pool>/<imagename>.'.format(self.url, self.name))
    self._pool_name, self._image_name = re_match.groups()

    # Try opening it and quit if that's not possible.
    try:
        ioctx = self._cluster.open_ioctx(self._pool_name)
    except rados.ObjectNotFound:
        raise FileNotFoundError('Ceph pool {} not found.'.format(self._pool_name)) from None

    try:
        image = rbd.Image(ioctx, self._image_name)
    except rbd.ImageNotFound:
        rbd.RBD().create(ioctx, self._image_name, size, old_format=False, features=self._new_image_features)
        rbd.Image(ioctx, self._image_name)
    else:
        try:
            if not force:
                raise FileExistsError('RBD image {} already exists. Force the restore if you want to overwrite it.'.format(
                    self.url))
            else:
                image_size = image.size()
                if size > image_size:
                    raise IOError('RBD image {} is too small. Its size is {} bytes, but we need {} bytes for the restore.'.format(
                        self.url, image_size, size))

                # If this is an existing image and sparse is true discard all objects from this image.
                # RBD discard only supports a maximum region length of 0x7fffffff.
                if sparse:
                    logger.debug('Discarding all objects of RBD image {}.'.format(self.url))
                    region_start = 0
                    bytes_to_end = image_size
                    while bytes_to_end > 0:
                        region_length = min(0x7fffffff, bytes_to_end)
                        image.discard(region_start, region_length)
                        region_start += region_length
                        bytes_to_end -= region_length
        finally:
            image.close()
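A hedged, standalone sketch of the discard loop above: a single RBD discard is limited to 0x7fffffff bytes, so larger images are discarded in chunks. The 5 GiB image size is a made-up example.

def discard_regions(image_size: int):
    # Yield (offset, length) pairs no longer than the RBD discard limit.
    region_start = 0
    bytes_to_end = image_size
    while bytes_to_end > 0:
        region_length = min(0x7fffffff, bytes_to_end)
        yield region_start, region_length
        region_start += region_length
        bytes_to_end -= region_length

# A 5 GiB image needs three discard calls:
print(list(discard_regions(5 * 1024**3)))
# [(0, 2147483647), (2147483647, 2147483647), (4294967294, 1073741826)]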
def _init_connection(self) -> None:
    if not hasattr(self._local, 'session'):
        logger.debug('Initializing S3 session and resource for {}'.format(threading.current_thread().name))
        self._local.session = boto3.session.Session()
        if self._disable_encoding_type:
            self._local.session.events.unregister('before-parameter-build.s3.ListObjects',
                                                  set_list_objects_encoding_type_url)
        self._local.resource = self._local.session.resource('s3', **self._resource_config)
        self._local.bucket = self._local.resource.Bucket(self._bucket_name)
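A hedged, standalone sketch of the pattern _init_connection() implements: boto3 sessions and resources are not guaranteed to be thread-safe, so each worker thread lazily builds its own via threading.local. The bucket name below is a placeholder.

import threading

import boto3

_local = threading.local()

def get_bucket(bucket_name: str = 'example-bucket'):
    # The first call on each thread creates that thread's session, resource and bucket handle.
    if not hasattr(_local, 'session'):
        _local.session = boto3.session.Session()
        _local.resource = _local.session.resource('s3')
        _local.bucket = _local.resource.Bucket(bucket_name)
    return _local.bucket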
def write_sync(self, block: Union[DereferencedBlock, Block], data: bytes) -> None:
    assert self._rbd_image is not None
    offset = block.idx * self.block_size
    t1 = time.time()
    written = self._rbd_image.write(data, offset, rados.LIBRADOS_OP_FLAG_FADVISE_DONTNEED)
    t2 = time.time()

    logger.debug('Wrote block {} in {:.3f}s'.format(block.idx, t2 - t1))

    assert written == block.size
def __init__(self, ad_hoc_config: str = None, sources: Sequence[str] = None) -> None:
    if ad_hoc_config is None:
        if not sources:
            sources = self._get_sources()

        config = None
        for source in sources:
            if os.path.isfile(source):
                try:
                    with open(source, 'r') as f:
                        config = ruamel.yaml.load(f, Loader=ruamel.yaml.SafeLoader)
                except Exception as exception:
                    raise ConfigurationError('Configuration file {} is invalid.'.format(source)) from exception
                if config is None:
                    raise ConfigurationError('Configuration file {} is empty.'.format(source))
                break

        if not config:
            raise ConfigurationError('No configuration file found in the default places ({}).'.format(
                ', '.join(sources)))
    else:
        config = ruamel.yaml.load(ad_hoc_config, Loader=ruamel.yaml.SafeLoader)
        if config is None:
            raise ConfigurationError('Configuration string is empty.')

    if self._CONFIGURATION_VERSION_KEY not in config:
        raise ConfigurationError('Configuration is missing required key "{}".'.format(self._CONFIGURATION_VERSION_KEY))

    version = str(config[self._CONFIGURATION_VERSION_KEY])
    if not re.fullmatch(self._CONFIGURATION_VERSION_REGEX, version):
        raise ConfigurationError('Configuration has invalid version of "{}".'.format(version))

    version_obj = semantic_version.Version.coerce(version)
    if version_obj not in VERSIONS.configuration.supported:
        raise ConfigurationError('Configuration has unsupported version of "{}".'.format(version))

    self._config_version = version_obj
    self._config = ConfigDict(self.validate(module=__name__, config=config))
    logger.debug('Loaded configuration.')
def shutdown(self) -> None:
    if len(self._futures) > 0:
        logger.warning('Job executor "{}" is being shutdown with {} outstanding jobs, cancelling them.'.format(
            self._name, len(self._futures)))
        for future in self._futures:
            future.cancel()
        logger.debug('Job executor "{}" cancelled all outstanding jobs.'.format(self._name))

        if not self._blocking_submit:
            # Get all jobs so that the semaphore gets released and still waiting jobs can complete
            for _ in self.get_completed():
                pass
            logger.debug('Job executor "{}" read results for all outstanding jobs.'.format(self._name))

    self._executor.shutdown()
def read_sync(self, block: Union[DereferencedBlock, Block]) -> bytes:
    assert self._rbd_image is not None
    offset = block.idx * self.block_size
    t1 = time.time()
    data = self._rbd_image.read(offset, block.size, rados.LIBRADOS_OP_FLAG_FADVISE_DONTNEED)
    t2 = time.time()

    if not data:
        raise EOFError('End of file reached on {} when there should be data.'.format(self.url))

    logger.debug('Read block {} in {:.3f}s'.format(block.idx, t2 - t1))

    return data
def read_get_completed(self,
                       timeout: Optional[int] = None
                       ) -> Iterator[Union[Tuple[DereferencedBlock, bytes], BaseException]]:
    try:
        while not self._reads_finished():
            logger.debug('Read queue length, outstanding reads, completion queue length: {}, {}, {}.'.format(
                len(self._read_queue), self._outstanding_aio_reads, self._read_completion_queue.qsize()))
            self._submit_aio_reads()
            completion, t1, t2, block, data = self._read_completion_queue.get(block=(timeout is None or timeout != 0),
                                                                              timeout=timeout)
            assert self._outstanding_aio_reads > 0
            self._outstanding_aio_reads -= 1
            try:
                completion.wait_for_complete_and_cb()
            except Exception as exception:
                yield exception
            else:
                read_return_value = completion.get_return_value()
                if read_return_value < 0:
                    raise IOError('Read of block {} failed.'.format(block.id))
                if read_return_value != block.size:
                    raise IOError('Short read of block {}. Wanted {} bytes but got {}.'.format(
                        block.id, block.size, read_return_value))
                if not data:
                    # We shouldn't get here because a failed read should be caught by the "read_return_value < 0"
                    # check above. See: https://github.com/ceph/ceph/blob/880468b4bf6f0a1995de5bd98c09007a00222cbf/src/pybind/rbd/rbd.pyx#L4145.
                    raise IOError('Read of block {} failed.'.format(block.id))
                logger.debug('Read block {} in {:.3f}s'.format(block.id, t2 - t1))
                yield block, data
            self._read_completion_queue.task_done()
    except queue.Empty:
        return
    else:
        return
def filter(self, versions: Sequence[Version]) -> List[Version]:
    # Category labels without latest
    categories = [category for category in self.rules.keys() if category != 'latest']
    for category in categories:
        setattr(self, '_{}_dict'.format(category), defaultdict(list))

    # Make our own copy
    versions = list(versions)

    # Sort from youngest to oldest
    versions.sort(key=lambda version: version.date.timestamp(), reverse=True)

    # Remove latest versions from consideration if configured
    if 'latest' in self.rules:
        logger.debug('Keeping {} latest versions.'.format(self.rules['latest']))
        del versions[:self.rules['latest']]

    dismissed_versions = []
    for version in versions:
        try:
            td = _Timedelta(version.date.timestamp(), self.reference_time)
        except _TimedeltaError as exception:
            # Err on the safe side, ignore this version (i.e. it won't be dismissed)
            logger.warning('Version {}: {}'.format(version.uid.v_string, exception))
            continue

        logger.debug('Time and time delta for version {} are {} and {}.'.format(version.uid.v_string, version.date, td))

        for category in categories:
            timecount = getattr(td, category)
            if timecount <= self.rules[category]:
                logger.debug('Found matching category {}, timecount {}.'.format(category, timecount))
                getattr(self, '_{}_dict'.format(category))[timecount].append(version)
                break
        else:
            # For loop did not break: The item doesn't fit into any category, it's too old
            dismissed_versions.append(version)
            logger.debug('Dismissing version, it doesn\'t fit into any category.')

    for category in categories:
        category_dict = getattr(self, '_{}_dict'.format(category))
        for timecount in category_dict:
            # Keep the oldest of each category, reject the rest
            dismissed_versions.extend(category_dict[timecount][:-1])

    return dismissed_versions
def _resolve_schema(self, *, name: str) -> Dict:
    try:
        child = self._schema_registry[name]
    except KeyError:
        raise InternalError('Schema for module {} is missing.'.format(name))
    result: Dict = {}
    if self._PARENTS_KEY in child:
        parent_names = child[self._PARENTS_KEY]
        for parent_name in parent_names:
            parent = self._resolve_schema(name=parent_name)
            self._merge_dicts(result, parent)
    result = self._merge_dicts(result, child)
    if self._PARENTS_KEY in result:
        del result[self._PARENTS_KEY]
    logger.debug('Resolved schema for {}: {}.'.format(name, result))
    return result
def _write(self, block: DereferencedBlock, data: bytes) -> DereferencedBlock:
    offset = block.id * self.block_size
    t1 = time.time()
    ioctx = self._cluster.open_ioctx(self._pool_name)
    with rbd.Image(ioctx, self._image_name, self._snapshot_name) as image:
        written = image.write(data, offset, rados.LIBRADOS_OP_FLAG_FADVISE_DONTNEED)
    t2 = time.time()

    logger.debug('{} wrote block {} in {:.3f}s'.format(
        threading.current_thread().name,
        block.id,
        t2 - t1,
    ))

    assert written == len(data)

    return block
def backup(self, version_uid: str, volume: str, snapshot: str, source: str, rbd_hints: str, base_version_uid: str,
           block_size: int, labels: List[str], storage: str) -> None:
    if version_uid is None:
        version_uid = '{}-{}'.format(volume[:248], random_string(6))
    version_uid_obj = VersionUid(version_uid)
    base_version_uid_obj = VersionUid(base_version_uid) if base_version_uid else None

    if labels:
        label_add, label_remove = InputValidation.parse_and_validate_labels(labels)
    with Benji(self.config) as benji_obj:
        hints = None
        if rbd_hints:
            logger.debug(f'Loading RBD hints from file {rbd_hints}.')
            with open(rbd_hints, 'r') as f:
                hints = hints_from_rbd_diff(f.read())
        backup_version = benji_obj.backup(version_uid=version_uid_obj,
                                          volume=volume,
                                          snapshot=snapshot,
                                          source=source,
                                          hints=hints,
                                          base_version_uid=base_version_uid_obj,
                                          storage_name=storage,
                                          block_size=block_size)

        if labels:
            for key, value in label_add:
                benji_obj.add_label(backup_version.uid, key, value)
            for key in label_remove:
                benji_obj.rm_label(backup_version.uid, key)
            if label_add:
                logger.info('Added label(s) to version {}: {}.'.format(
                    backup_version.uid, ', '.join('{}={}'.format(name, value) for name, value in label_add)))
            if label_remove:
                logger.info('Removed label(s) from version {}: {}.'.format(backup_version.uid, ', '.join(label_remove)))

        if self.machine_output:
            benji_obj.export_any({'versions': [backup_version]},
                                 sys.stdout,
                                 ignore_relationships=(((Version,), ('blocks',)),))
def close(self):
    """ Close the io
    """
    if self._read_executor:
        if len(self._read_futures) > 0:
            logger.warning('IO backend closed with {} outstanding read jobs, cancelling them.'.format(
                len(self._read_futures)))
            for future in self._read_futures:
                future.cancel()
            logger.debug('IO backend cancelled all outstanding read jobs.')
            # Get all jobs so that the semaphore gets released and still waiting jobs can complete
            for future in self.read_get_completed():
                pass
            logger.debug('IO backend read results from all outstanding read jobs.')
        self._read_executor.shutdown()
def _write(self, block: DereferencedBlock, data: bytes) -> DereferencedBlock:
    offset = block.id * self._block_size
    t1 = time.time()
    with open(self._path, 'rb+') as f:
        f.seek(offset)
        written = f.write(data)
        os.posix_fadvise(f.fileno(), offset, len(data), os.POSIX_FADV_DONTNEED)
    t2 = time.time()

    logger.debug('{} wrote block {} in {:.2f}s'.format(
        threading.current_thread().name,
        block.id,
        t2 - t1,
    ))

    assert written == len(data)

    return block
def _read(self, block: DereferencedBlock, metadata_only: bool) -> Tuple[DereferencedBlock, Optional[bytes], Dict]:
    key = block.uid.storage_object_to_path()
    metadata_key = key + self._META_SUFFIX
    data: Optional[bytes] = None
    try:
        t1 = time.time()
        if not metadata_only:
            data = self._read_object(key)
            data_length = len(data)
        else:
            data_length = self._read_object_length(key)
        metadata_json = self._read_object(metadata_key)
        # Parenthesize the conditional expression so the metadata length is always counted.
        time.sleep(self.read_throttling.consume((len(data) if data else 0) + len(metadata_json)))
        t2 = time.time()
    except FileNotFoundError as exception:
        raise InvalidBlockException(
            'Object metadata or data of block {} (UID {}) not found.'.format(block.id, block.uid),
            block) from exception

    try:
        metadata = self._decode_metadata(metadata_json=metadata_json, key=key, data_length=data_length)
    except (KeyError, ValueError) as exception:
        raise InvalidBlockException('Object metadata of block {} (UID {}) is invalid.'.format(block.id, block.uid),
                                    block) from exception

    if self._CHECKSUM_KEY not in metadata:
        raise InvalidBlockException(
            'Required object metadata key {} is missing for block {} (UID {}).'.format(
                self._CHECKSUM_KEY, block.id, block.uid), block)

    if not metadata_only and self._TRANSFORMS_KEY in metadata:
        data = self._decapsulate(data, metadata[self._TRANSFORMS_KEY])  # type: ignore

    logger.debug('{} read data of uid {} in {:.2f}s{}'.format(threading.current_thread().name, block.uid, t2 - t1,
                                                              ' (metadata only)' if metadata_only else ''))

    return block, data, metadata
def __init__(self, cfg=None, sources=None, merge_defaults=True):
    yaml = YAML(typ='safe', pure=True)
    default_config = yaml.load(self.DEFAULT_CONFIG)

    if cfg is None:
        if not sources:
            sources = self._get_sources()

        config = None
        for source in sources:
            if os.path.isfile(source):
                try:
                    config = yaml.load(Path(source))
                except Exception as exception:
                    raise ConfigurationError('Configuration file {} is invalid.'.format(source)) from exception
                if config is None:
                    raise ConfigurationError('Configuration file {} is empty.'.format(source))
                break

        if not config:
            raise ConfigurationError('No configuration file found in the default places ({}).'.format(
                ', '.join(sources)))
    else:
        config = yaml.load(cfg)
        if config is None:
            raise ConfigurationError('Configuration string is empty.')

    if 'configurationVersion' not in config or type(config['configurationVersion']) is not str:
        raise ConfigurationError('Configuration version is missing or not a string.')

    if config['configurationVersion'] != self.CONFIG_VERSION:
        raise ConfigurationError('Unknown configuration version {}.'.format(config['configurationVersion']))

    if merge_defaults:
        self._merge_dicts(config, default_config)

    with StringIO() as redacted_config_string:
        redacted_config = yaml.load(self.REDACT)
        self._merge_dicts(redacted_config, config)
        yaml.dump(redacted_config, redacted_config_string)
        logger.debug('Loaded configuration: {}'.format(redacted_config_string.getvalue()))

    self.config = config
def _read(self, block: DereferencedBlock) -> Tuple[DereferencedBlock, bytes]:
    offset = block.id * self._block_size
    t1 = time.time()
    with open(self._path, 'rb') as f:
        f.seek(offset)
        data = f.read(block.size)
        os.posix_fadvise(f.fileno(), offset, block.size, os.POSIX_FADV_DONTNEED)
    t2 = time.time()

    if not data:
        raise EOFError('End of file reached on {} when there should be data.'.format(self.url))

    logger.debug('{} read block {} in {:.2f}s'.format(
        threading.current_thread().name,
        block.id,
        t2 - t1,
    ))

    return block, data