def __init__(self, config, in_memory=False):
    if not in_memory:
        self._engine = sqlalchemy.create_engine(config.get('metadataBackend.engine', types=str))
    else:
        logger.info('Running in metadata-backend-less mode.')
        self._engine = sqlalchemy.create_engine('sqlite://')

def nbd(self, bind_address: str, bind_port: int, read_only: bool) -> None:
    with Benji(self.config) as benji_obj:
        store = BenjiStore(benji_obj)
        addr = (bind_address, bind_port)
        server = NbdServer(addr, store, read_only)
        logger.info("Starting to serve NBD on %s:%s" % (addr[0], addr[1]))
        server.serve_forever()

def nbd_client(self, version_uid):
    self.subprocess_run(args=['sudo', 'nbd-client', '127.0.0.1', '-p', str(self.SERVER_PORT), '-l'],
                        success_regexp='^Negotiation: ..\n{}\n$'.format(version_uid[0].v_string))

    version_uid, size = version_uid
    self.subprocess_run(
        args=['sudo', 'nbd-client', '-N', version_uid.v_string, '127.0.0.1', '-p', str(self.SERVER_PORT),
              self.NBD_DEVICE],
        success_regexp=r'^Negotiation: ..size = \d+MB\nbs=1024, sz=\d+ bytes\n$|^Negotiation: ..size = \d+MB|Connected /dev/nbd\d+$')

    # Read the whole device back and compare it to the original image
    count = 0
    nbd_data = bytearray()
    with open(self.NBD_DEVICE, 'rb') as f:
        while True:
            data = f.read(64 * 1024 + random.randint(0, 8192))
            if not data:
                break
            count += len(data)
            nbd_data += data
    self.assertEqual(size, count)
    image_data = self.read_file(self.testpath.path + '/image')
    logger.info('image_data size {}, nbd_data size {}'.format(len(image_data), len(nbd_data)))
    self.assertEqual(image_data, bytes(nbd_data))

    # Overwrite the device in 4096 byte steps and verify each write
    f = os.open(self.NBD_DEVICE, os.O_RDWR)
    for offset in range(0, size, 4096):
        os.lseek(f, offset, os.SEEK_SET)
        data = self.random_bytes(4096)
        written = os.write(f, data)
        os.fsync(f)
        self.assertEqual(len(data), written)
        # Discard the cache so that the read request below really goes to the NBD server
        os.posix_fadvise(f, offset, len(data), os.POSIX_FADV_DONTNEED)
        os.lseek(f, offset, os.SEEK_SET)
        read_data = os.read(f, 4096)
        self.assertEqual(data, read_data)
    os.close(f)

    self.subprocess_run(args=['sudo', 'nbd-client', '-d', self.NBD_DEVICE],
                        success_regexp='^disconnect, sock, done\n$')

    # Signal the NBD server to stop
    self.nbd_server.stop()

def rest_api(self, bind_address: str, bind_port: int, threads: int) -> None:
    from benji.restapi import RestAPI
    api = RestAPI(self.config)
    logger.info(f'Starting REST API via gunicorn on {bind_address}:{bind_port}.')
    debug = logger.isEnabledFor(logging.DEBUG)
    api.run(bind_address=bind_address, bind_port=bind_port, threads=threads, debug=debug)

def nbd(self, bind_address, bind_port, read_only):
    benji_obj = None
    try:
        benji_obj = Benji(self.config)
        store = BenjiStore(benji_obj)
        addr = (bind_address, bind_port)
        server = NbdServer(addr, store, read_only)
        logger.info("Starting to serve NBD on %s:%s" % (addr[0], addr[1]))
        server.serve_forever()
    finally:
        if benji_obj:
            benji_obj.close()

def backup(self, version_name: str, snapshot_name: str, source: str, rbd_hints: str, base_version_uid: str,
           block_size: int, labels: List[str], storage: str) -> None:
    # Validate version_name and snapshot_name
    if not InputValidation.is_backup_name(version_name):
        raise benji.exception.UsageError('Version name {} is invalid.'.format(version_name))
    if not InputValidation.is_snapshot_name(snapshot_name):
        raise benji.exception.UsageError('Snapshot name {} is invalid.'.format(snapshot_name))

    base_version_uid_obj = VersionUid(base_version_uid) if base_version_uid else None

    if labels:
        label_add, label_remove = self._parse_labels(labels)
        if label_remove:
            raise benji.exception.UsageError('Removing labels from a new version makes no sense.')

    benji_obj = None
    try:
        benji_obj = Benji(self.config, block_size=block_size)
        hints = None
        if rbd_hints:
            # Read the whole hints file
            data = ''.join(fileinput.input(rbd_hints))
            hints = hints_from_rbd_diff(data)
        backup_version = benji_obj.backup(version_name, snapshot_name, source, hints, base_version_uid_obj, storage)

        if labels:
            for key, value in label_add:
                benji_obj.add_label(backup_version.uid, key, value)
            for key in label_remove:
                benji_obj.rm_label(backup_version.uid, key)
            if label_add:
                logger.info('Added label(s) to version {}: {}.'.format(
                    backup_version.uid.v_string,
                    ', '.join('{}={}'.format(name, value) for name, value in label_add)))
            if label_remove:
                logger.info('Removed label(s) from version {}: {}.'.format(backup_version.uid.v_string,
                                                                            ', '.join(label_remove)))

        if self.machine_output:
            benji_obj.export_any({'versions': [backup_version]},
                                 sys.stdout,
                                 ignore_relationships=[((Version,), ('blocks',))])
    finally:
        if benji_obj:
            benji_obj.close()

def version_info(self) -> None:
    if not self.machine_output:
        logger.info('Benji version: {}.'.format(__version__))
        logger.info('Configuration version: {}, supported {}.'.format(VERSIONS.configuration.current,
                                                                       VERSIONS.configuration.supported))
        logger.info('Metadata version: {}, supported {}.'.format(VERSIONS.database_metadata.current,
                                                                 VERSIONS.database_metadata.supported))
        logger.info('Object metadata version: {}, supported {}.'.format(VERSIONS.object_metadata.current,
                                                                        VERSIONS.object_metadata.supported))
    else:
        result = {
            'version': __version__,
            'configuration_version': {
                'current': str(VERSIONS.configuration.current),
                'supported': str(VERSIONS.configuration.supported)
            },
            'database_metadata_version': {
                'current': str(VERSIONS.database_metadata.current),
                'supported': str(VERSIONS.database_metadata.supported)
            },
            'object_metadata_version': {
                'current': str(VERSIONS.object_metadata.current),
                'supported': str(VERSIONS.object_metadata.supported)
            },
        }
        print(json.dumps(result, indent=4))

def __init__(self, *, config: Config, name: str, module_configuration: ConfigDict) -> None:
    self._name = name

    self._active_transforms: List[TransformBase] = []
    active_transforms = Config.get_from_dict(module_configuration, 'activeTransforms', None, types=list)
    if active_transforms is not None:
        for transform in active_transforms:
            self._active_transforms.append(TransformFactory.get_by_name(transform))
        logger.info('Active transforms for storage {}: {}.'.format(
            name, ', '.join('{} ({})'.format(transform.name, transform.module)
                            for transform in self._active_transforms)))

    simultaneous_writes = Config.get_from_dict(module_configuration, 'simultaneousWrites', types=int)
    simultaneous_reads = Config.get_from_dict(module_configuration, 'simultaneousReads', types=int)
    simultaneous_removals = Config.get_from_dict(module_configuration, 'simultaneousRemovals', types=int)
    bandwidth_read = Config.get_from_dict(module_configuration, 'bandwidthRead', types=int)
    bandwidth_write = Config.get_from_dict(module_configuration, 'bandwidthWrite', types=int)

    self._consistency_check_writes = Config.get_from_dict(module_configuration,
                                                          'consistencyCheckWrites',
                                                          False,
                                                          types=bool)

    hmac_key_encoded = Config.get_from_dict(module_configuration, 'hmac.key', None, types=str)
    hmac_key: Optional[bytes] = None
    if hmac_key_encoded is None:
        hmac_password = Config.get_from_dict(module_configuration, 'hmac.password', None, types=str)
        if hmac_password is not None:
            hmac_kdf_salt = base64.b64decode(Config.get_from_dict(module_configuration, 'hmac.kdfSalt', types=str))
            hmac_kdf_iterations = Config.get_from_dict(module_configuration, 'hmac.kdfIterations', types=int)
            hmac_key = derive_key(salt=hmac_kdf_salt,
                                  iterations=hmac_kdf_iterations,
                                  key_length=32,
                                  password=hmac_password)
    else:
        hmac_key = base64.b64decode(hmac_key_encoded)
    self._dict_hmac: Optional[DictHMAC] = None
    if hmac_key is not None:
        logger.info('Enabling HMAC object metadata integrity protection for storage {}.'.format(name))
        self._dict_hmac = DictHMAC(hmac_key=self._HMAC_KEY, secret_key=hmac_key)

    self.read_throttling = TokenBucket()
    self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
    self.write_throttling = TokenBucket()
    self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

    self._read_executor = JobExecutor(name='Storage-Read', workers=simultaneous_reads, blocking_submit=False)
    self._write_executor = JobExecutor(name='Storage-Write', workers=simultaneous_writes, blocking_submit=True)
    self._remove_executor = JobExecutor(name='Storage-Remove', workers=simultaneous_removals, blocking_submit=True)

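# A sketch of the HMAC part of a module configuration (an assumption, not from
# the source; all values are illustrative). Either hmac.key is given directly
# as base64, or a key is derived from hmac.password together with hmac.kdfSalt
# and hmac.kdfIterations via derive_key() as in __init__ above.
example_module_configuration = {
    'simultaneousWrites': 3,
    'simultaneousReads': 3,
    'simultaneousRemovals': 3,
    'bandwidthRead': 0,  # 0 disables read throttling
    'bandwidthWrite': 0,  # 0 disables write throttling
    'hmac': {
        'password': 'secret',
        'kdfSalt': 'XCbkW/a11BjTeLW5+4bK6g==',  # base64-encoded random salt
        'kdfIterations': 65536,
    },
}
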
def label(self, version_uid: str, labels: List[str]) -> None:
    version_uid_obj = VersionUid(version_uid)
    label_add, label_remove = InputValidation.parse_and_validate_labels(labels)
    with Benji(self.config) as benji_obj:
        for name, value in label_add:
            benji_obj.add_label(version_uid_obj, name, value)
        for name in label_remove:
            benji_obj.rm_label(version_uid_obj, name)
        if label_add:
            logger.info('Added label(s) to version {}: {}.'.format(
                version_uid_obj, ', '.join('{}={}'.format(name, value) for name, value in label_add)))
        if label_remove:
            logger.info('Removed label(s) from version {}: {}.'.format(version_uid_obj, ', '.join(label_remove)))

def backup(self, version_uid: str, volume: str, snapshot: str, source: str, rbd_hints: str, base_version_uid: str,
           block_size: int, labels: List[str], storage: str) -> None:
    if version_uid is None:
        version_uid = '{}-{}'.format(volume[:248], random_string(6))
    version_uid_obj = VersionUid(version_uid)
    base_version_uid_obj = VersionUid(base_version_uid) if base_version_uid else None

    if labels:
        label_add, label_remove = InputValidation.parse_and_validate_labels(labels)

    with Benji(self.config) as benji_obj:
        hints = None
        if rbd_hints:
            logger.debug(f'Loading RBD hints from file {rbd_hints}.')
            with open(rbd_hints, 'r') as f:
                hints = hints_from_rbd_diff(f.read())

        backup_version = benji_obj.backup(version_uid=version_uid_obj,
                                          volume=volume,
                                          snapshot=snapshot,
                                          source=source,
                                          hints=hints,
                                          base_version_uid=base_version_uid_obj,
                                          storage_name=storage,
                                          block_size=block_size)

        if labels:
            for key, value in label_add:
                benji_obj.add_label(backup_version.uid, key, value)
            for key in label_remove:
                benji_obj.rm_label(backup_version.uid, key)
            if label_add:
                logger.info('Added label(s) to version {}: {}.'.format(
                    backup_version.uid, ', '.join('{}={}'.format(name, value) for name, value in label_add)))
            if label_remove:
                logger.info('Removed label(s) from version {}: {}.'.format(backup_version.uid,
                                                                            ', '.join(label_remove)))

        if self.machine_output:
            benji_obj.export_any({'versions': [backup_version]},
                                 sys.stdout,
                                 ignore_relationships=(((Version,), ('blocks',)),))

def task_with_blocks(self, task: str, *, version_uid: str, blocks_done: int, blocks_count: int,
                     per_thousand: int = 1000) -> None:
    log_every_blocks = max(1, blocks_count // max(1, int(1000 / per_thousand)))
    if per_thousand == 1000 or blocks_done % log_every_blocks == 0 or blocks_done == 1 or blocks_done == blocks_count:
        message = '{} {}/{} blocks ({:.1f}%)'.format(task, blocks_done, blocks_count,
                                                     blocks_done / blocks_count * 100)
        logger.info(message)
        self._setproctitle('{} - {}'.format(message, version_uid))

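# A small sketch of the logging cadence implemented above (an assumption, not
# part of the source): per_thousand expresses how much progress, in thousandths
# of the whole run, lies between two log messages. _log_every_blocks is a
# hypothetical helper that mirrors the formula in task_with_blocks().

def _log_every_blocks(blocks_count: int, per_thousand: int) -> int:
    # Same expression as in task_with_blocks() above
    return max(1, blocks_count // max(1, int(1000 / per_thousand)))

assert _log_every_blocks(50000, 10) == 500  # a message per 500 blocks, i.e. every 1% of progress
assert _log_every_blocks(50000, 1) == 50    # every 0.1% of progress
assert _log_every_blocks(3, 10) == 1        # tiny jobs never divide below one block
# (per_thousand=1000 short-circuits in task_with_blocks() and logs every block)
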
def test(self):
    benji_obj = self.benjiOpen()
    store = BenjiStore(benji_obj)
    addr = ('127.0.0.1', self.SERVER_PORT)
    read_only = False
    self.nbd_server = NbdServer(addr, store, read_only)
    logger.info("Starting to serve NBD on %s:%s" % (addr[0], addr[1]))

    self.subprocess_run(args=['sudo', 'modprobe', 'nbd'])

    self.nbd_client_thread = threading.Thread(target=self.nbd_client, daemon=True, args=(self.version_uid,))
    self.nbd_client_thread.start()
    self.nbd_server.serve_forever()
    self.nbd_client_thread.join()

    self.assertEqual({self.version_uid[0], VersionUid(2)},
                     {version.uid for version in benji_obj.ls()})
    benji_obj.close()

def set_version(self, version_uid, *, valid=None, protected=None):
    try:
        version = self.get_version(version_uid)
        if valid is not None:
            version.valid = valid
        if protected is not None:
            version.protected = protected
        self._session.commit()
        if valid is not None:
            logger_func = logger.info if valid else logger.error
            logger_func('Marked version {} as {}.'.format(version_uid.readable,
                                                          'valid' if valid else 'invalid'))
        if protected is not None:
            logger.info('Marked version {} as {}.'.format(version_uid.readable,
                                                          'protected' if protected else 'unprotected'))
    except:
        self._session.rollback()
        raise

def label(self, version_uid: str, labels: List[str]) -> None:
    version_uid_obj = VersionUid(version_uid)
    label_add, label_remove = InputValidation.parse_and_validate_labels(labels)
    benji_obj = None
    try:
        benji_obj = Benji(self.config)
        for name, value in label_add:
            benji_obj.add_label(version_uid_obj, name, value)
        for name in label_remove:
            benji_obj.rm_label(version_uid_obj, name)
        if label_add:
            logger.info('Added label(s) to version {}: {}.'.format(
                version_uid_obj.v_string, ', '.join('{}={}'.format(name, value) for name, value in label_add)))
        if label_remove:
            logger.info('Removed label(s) from version {}: {}.'.format(version_uid_obj.v_string,
                                                                       ', '.join(label_remove)))
    finally:
        if benji_obj:
            benji_obj.close()

def _log_compression_statistics(self):
    if self.active_compression is None or self._compression_statistics['objects_considered'] == 0:
        return

    overall_ratio, ratio = 0.0, 0.0
    if self._compression_statistics['data_out'] > 0:
        overall_ratio = self._compression_statistics['data_in'] / self._compression_statistics['data_out']
    if self._compression_statistics['data_out_compression'] > 0:
        ratio = self._compression_statistics['data_in_compression'] \
            / self._compression_statistics['data_out_compression']

    tbl = PrettyTable()
    tbl.field_names = [
        'Objects considered', 'Objects compressed', 'Data in', 'Data out', 'Overall compression ratio',
        'Data input to compression', 'Data output from compression', 'Compression ratio'
    ]
    for field_name in tbl.field_names:
        tbl.align[field_name] = 'r'
    tbl.add_row([
        self._compression_statistics['objects_considered'],
        self._compression_statistics['objects_compressed'],
        self._compression_statistics['data_in'],
        self._compression_statistics['data_out'],
        '{:.2f}'.format(overall_ratio),
        self._compression_statistics['data_in_compression'],
        self._compression_statistics['data_out_compression'],
        '{:.2f}'.format(ratio),
    ])
    logger.info('Compression statistics:\n' + textwrap.indent(str(tbl), '  '))

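# Worked example of the two ratios above (numbers are illustrative only):
# if data_in = 1000 MiB entered the backend and data_out = 600 MiB left it,
# the overall compression ratio is 1000/600 ≈ 1.67. If only 800 MiB of that
# was routed through the compressor (data_in_compression) and it emitted
# 400 MiB (data_out_compression), the compression ratio proper is
# 800/400 = 2.00; the difference stems from objects stored uncompressed.
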
def task_with_version(self, task: str, *, version_uid: str) -> None:
    logger.info(task)
    self._setproctitle('{} - {}'.format(task, version_uid))

def task(self, task: str):
    logger.info(task)
    self._setproctitle(task)

def _bulk_scrub(self, method, names, tags, version_percentage, block_percentage):
    if version_percentage:
        version_percentage = int(version_percentage)
    if block_percentage:
        block_percentage = int(block_percentage)
    history = BlockUidHistory()
    benji_obj = None
    try:
        benji_obj = Benji(self.config)
        versions = []
        if names:
            for name in names:
                versions.extend(benji_obj.ls(version_name=name, version_tags=tags))
        else:
            versions.extend(benji_obj.ls(version_tags=tags))

        errors = []
        if version_percentage and versions:
            # Will always scrub at least one matching version
            versions = random.sample(versions, max(1, int(len(versions) * version_percentage / 100)))
        if not versions:
            logger.info('No matching versions found.')

        for version in versions:
            try:
                logger.info('Scrubbing version {} with name {}.'.format(version.uid.readable, version.name))
                getattr(benji_obj, method)(version.uid, block_percentage=block_percentage, history=history)
            except benji.exception.ScrubbingError as exception:
                logger.error(exception)
                errors.append(version)

        if errors:
            if self.machine_output:
                benji_obj.export_any(
                    {
                        'versions': [benji_obj.ls(version_uid=version.uid)[0] for version in versions],
                        'errors': [benji_obj.ls(version_uid=version.uid)[0] for version in errors]
                    },
                    sys.stdout,
                    ignore_relationships=[((Version,), ('blocks',))])
            raise benji.exception.ScrubbingError('One or more versions had scrubbing errors: {}.'.format(
                ', '.join(version.uid.readable for version in errors)))
        else:
            if self.machine_output:
                benji_obj.export_any(
                    {
                        'versions': [benji_obj.ls(version_uid=version.uid)[0] for version in versions],
                        'errors': []
                    },
                    sys.stdout,
                    ignore_relationships=[((Version,), ('blocks',))])
    finally:
        if benji_obj:
            benji_obj.close()

def filter(self, versions):
    # Category labels without 'latest'
    categories = [category for category in self.rules.keys() if category != 'latest']
    for category in categories:
        setattr(self, '_{}_dict'.format(category), defaultdict(list))

    # Make our own copy
    versions = list(versions)

    # Sort from youngest to oldest
    versions.sort(key=lambda version: version.date.timestamp(), reverse=True)

    # Remove the latest versions from consideration if configured
    if 'latest' in self.rules:
        logger.debug('Keeping {} latest versions.'.format(self.rules['latest']))
        del versions[:self.rules['latest']]

    dismissed_versions = []
    for version in versions:
        if version.protected:
            logger.info('Not considering version {}, it is protected.'.format(version.uid.readable))
            continue

        try:
            td = _Timedelta(version.date.timestamp(), self.reference_time)
        except _TimedeltaError as exception:
            # Err on the safe side: ignore this version (i.e. it won't be dismissed)
            logger.warning('Version {}: {}'.format(version.uid.readable, exception))
            continue

        logger.debug('Time and time delta for version {} are {} and {}.'.format(version.uid.readable, version.date,
                                                                                td))

        for category in categories:
            timecount = getattr(td, category)
            if timecount <= self.rules[category]:
                logger.debug('Found matching category {}, timecount {}.'.format(category, timecount))
                getattr(self, '_{}_dict'.format(category))[timecount].append(version)
                break
        else:
            # The for loop didn't break: the version doesn't fit into any category, it's too old
            dismissed_versions.append(version)
            logger.debug('Dismissing version, it doesn\'t fit into any category.')

    for category in categories:
        category_dict = getattr(self, '_{}_dict'.format(category))
        for timecount in category_dict:
            # Keep the oldest version of each bucket, dismiss the rest
            dismissed_versions.extend(category_dict[timecount][:-1])

    return dismissed_versions

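# A sketch of a rule set driving filter() above (an assumption, not from the
# source). The category names must match attributes of _Timedelta; hours, days
# and weeks are assumed here for illustration; retention_filter and
# benji_obj.rm() are hypothetical names.
#
# rules = {'latest': 3, 'hours': 24, 'days': 30, 'weeks': 12}
#
# filter() then keeps the 3 youngest versions unconditionally, buckets the
# remaining unprotected versions by age, keeps the oldest version per bucket,
# and returns everything else as dismissible:
#
# for version in retention_filter.filter(benji_obj.ls()):
#     benji_obj.rm(version.uid)
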
def __init__(self, config):
    self.encryption = {}
    self.compression = {}
    self.active_encryption = None
    self.active_compression = None

    encryption_modules = config.get('dataBackend.encryption', None, types=list)
    if encryption_modules is not None:
        for encryption_module_dict in encryption_modules:
            type = config.get_from_dict(encryption_module_dict, 'type', types=str)
            identifier = config.get_from_dict(encryption_module_dict, 'identifier', types=str)
            materials = config.get_from_dict(encryption_module_dict, 'materials', types=dict)
            try:
                encryption_module = importlib.import_module('{}.{}'.format(self._ENCRYPTION_PACKAGE_PREFIX, type))
            except ImportError:
                raise ConfigurationError('Module file {}.{} not found or related import error.'.format(
                    self._ENCRYPTION_PACKAGE_PREFIX, type))
            else:
                if type != encryption_module.Encryption.NAME:
                    raise InternalError('Encryption module type and name don\'t agree ({} != {}).'.format(
                        type, encryption_module.Encryption.NAME))
                self.encryption[identifier] = encryption_module.Encryption(identifier=identifier,
                                                                           materials=materials)

    active_encryption = config.get('dataBackend.{}.activeEncryption'.format(self.NAME), None, types=str)
    if active_encryption is not None:
        if self.encryption and active_encryption in self.encryption:
            logger.info('Encryption is enabled for the {} data backend.'.format(self.NAME))
            self.active_encryption = self.encryption[active_encryption]
        else:
            raise ConfigurationError('Encryption identifier {} is unknown.'.format(active_encryption))

    compression_modules = config.get('dataBackend.compression', None, types=list)
    if compression_modules is not None:
        for compression_module_dict in compression_modules:
            type = config.get_from_dict(compression_module_dict, 'type', types=str)
            materials = config.get_from_dict(compression_module_dict, 'materials', None, types=dict)
            try:
                compression_module = importlib.import_module('{}.{}'.format(self._COMPRESSION_PACKAGE_PREFIX, type))
            except ImportError:
                raise ConfigurationError('Module file {}.{} not found or related import error.'.format(
                    self._COMPRESSION_PACKAGE_PREFIX, type))
            else:
                if type != compression_module.Compression.NAME:
                    raise InternalError('Compression module type and name don\'t agree ({} != {}).'.format(
                        type, compression_module.Compression.NAME))
                self.compression[type] = compression_module.Compression(materials=materials)

    active_compression = config.get('dataBackend.{}.activeCompression'.format(self.NAME), None, types=str)
    if active_compression is not None:
        if self.compression and active_compression in self.compression:
            logger.info('Compression is enabled for the {} data backend.'.format(self.NAME))
            self.active_compression = self.compression[active_compression]
        else:
            raise ConfigurationError('Compression type {} is unknown.'.format(active_compression))

    simultaneous_writes = config.get('dataBackend.simultaneousWrites', types=int)
    simultaneous_reads = config.get('dataBackend.simultaneousReads', types=int)
    bandwidth_read = config.get('dataBackend.bandwidthRead', types=int)
    bandwidth_write = config.get('dataBackend.bandwidthWrite', types=int)

    self._consistency_check_writes = config.get('dataBackend.consistencyCheckWrites', False, types=bool)

    self._compression_statistics = {
        'objects_considered': 0,
        'objects_compressed': 0,
        'data_in': 0,
        'data_out': 0,
        'data_in_compression': 0,
        'data_out_compression': 0
    }

    self.read_throttling = TokenBucket()
    self.read_throttling.set_rate(bandwidth_read)  # 0 disables throttling
    self.write_throttling = TokenBucket()
    self.write_throttling.set_rate(bandwidth_write)  # 0 disables throttling

    self._read_executor = ThreadPoolExecutor(max_workers=simultaneous_reads,
                                             thread_name_prefix='DataBackend-Reader')
    self._read_futures = []
    self._read_semaphore = BoundedSemaphore(simultaneous_reads + self.READ_QUEUE_LENGTH)
    self._write_executor = ThreadPoolExecutor(max_workers=simultaneous_writes,
                                              thread_name_prefix='DataBackend-Writer')
    self._write_futures = []
    self._write_semaphore = BoundedSemaphore(simultaneous_writes + self.WRITE_QUEUE_LENGTH)

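# A sketch of the configuration layout the loops above expect (an assumption,
# not from the source; the module type name and key material are hypothetical):
#
# dataBackend:
#   encryption:
#     - identifier: key-2019
#       type: aes_256_gcm
#       materials:
#         masterKey: <base64-encoded 32-byte key>
#   <backend name>:
#     activeEncryption: key-2019
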
def get_delete_candidates(self, dt=3600):
    rounds = 0
    false_positives_count = 0
    hit_list_count = 0
    # Only consider blocks that were marked as deleted more than dt seconds ago
    cut_off_date = datetime.datetime.utcnow() - datetime.timedelta(seconds=dt)
    while True:
        # http://stackoverflow.com/questions/7389759/memory-efficient-built-in-sqlalchemy-iterator-generator
        delete_candidates = self._session.query(DeletedBlock)\
            .filter(DeletedBlock.date < cut_off_date)\
            .limit(250)\
            .all()
        if not delete_candidates:
            break

        false_positives = set()
        hit_list = set()
        for candidate in delete_candidates:
            rounds += 1
            if rounds % 1000 == 0:
                logger.info("Cleanup-fast: {} false positives, {} data deletions.".format(
                    false_positives_count, hit_list_count))

            # A block UID that is still referenced by any version must not be deleted
            block = self._session.query(Block)\
                .filter(Block.uid == candidate.uid)\
                .limit(1)\
                .scalar()
            if block:
                false_positives.add(candidate.uid)
                false_positives_count += 1
            else:
                hit_list.add(candidate.uid)
                hit_list_count += 1

        if false_positives:
            logger.debug("Cleanup-fast: Removing {} false positives from the delete candidates.".format(
                len(false_positives)))
            self._session.query(DeletedBlock)\
                .filter(DeletedBlock.uid.in_(false_positives))\
                .delete(synchronize_session=False)

        if hit_list:
            logger.debug("Cleanup-fast: {} delete candidates will really be deleted.".format(len(hit_list)))
            self._session.query(DeletedBlock)\
                .filter(DeletedBlock.uid.in_(hit_list))\
                .delete(synchronize_session=False)
            yield hit_list

    self._session.commit()
    logger.info("Cleanup-fast: Cleanup finished. {} false positives, {} data deletions.".format(
        false_positives_count, hit_list_count))
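
# A minimal usage sketch (an assumption, not from the source): each yielded
# batch contains block UIDs whose DeletedBlock rows have been removed from the
# metadata and whose backing objects can now be removed from the data backend.
# The names metadata_backend and data_backend.rm() are hypothetical.
#
# for uid_batch in metadata_backend.get_delete_candidates(dt=3600):
#     for block_uid in uid_batch:
#         data_backend.rm(block_uid)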