def ceph_snapshot_create_post_success(sender: str, volume: str, pool: str, namespace: str, image: str, snapshot: str,
                                      context: Dict[str, Any]) -> None:
    """Unfreeze the PV's filesystem after the RBD snapshot was created successfully.

    Reads the fsfreeze information recorded in *context* by the pre signal handler.
    Unfreezing is retried according to the FSFREEZE_UNFREEZE_TRIES delay schedule;
    if all attempts fail, an error is logged but no exception is raised (best effort,
    the snapshot itself already succeeded).
    """
    assert isinstance(context, dict)
    pv_fsfreeze = context['pv-fsfreeze']
    if not pv_fsfreeze:
        # The pre handler decided not to freeze this PV, so there is nothing to thaw.
        return
    pv_host_ip = context['pv-host-ip']
    pv_fsfreeze_pod = context['pv-fsfreeze-pod']
    pv_mount_point = context['pv-mount-point']

    logger.info(f'Unfreezing filesystem {pv_mount_point} on host {pv_host_ip}.')
    service_account_namespace = benji.k8s_tools.kubernetes.service_account_namespace()
    for delay in FSFREEZE_UNFREEZE_TRIES:
        if delay > 0:
            time.sleep(delay)
        try:
            benji.k8s_tools.kubernetes.pod_exec(['fsfreeze', '--unfreeze', pv_mount_point],
                                                name=pv_fsfreeze_pod,
                                                namespace=service_account_namespace,
                                                container=FSFREEZE_CONTAINER_NAME,
                                                timeout=FSFREEZE_TIMEOUT)
        except Exception as exception:
            # Previously the exception was swallowed silently; log it so failed
            # attempts are visible, then retry after the next delay.
            logger.warning(f'Unfreezing filesystem {pv_mount_point} failed with a '
                           f'{type(exception).__name__} exception, will retry: {str(exception)}')
        else:
            logger.debug('Unfreezing filesystem succeeded.')
            break
    else:
        # Loop exhausted without a successful unfreeze.
        logger.error(f'Giving up on unfreezing filesystem {pv_mount_point} on host {pv_host_ip}.')
def backup_initial(*,
                   volume: str,
                   pool: str,
                   namespace: str = '',
                   image: str,
                   version_labels: Dict[str, str],
                   version_uid: Optional[str],
                   source_compare: bool = False,
                   context: Any = None) -> Dict[str, str]:
    """Create a fresh RBD snapshot of *image* and perform a full initial Benji backup of it.

    The 'rbd diff' allocation map of the snapshot is passed to Benji as hints so
    unallocated regions are skipped. Returns Benji's machine-readable result dict.
    """
    snapshot = datetime.utcnow().strftime(RBD_SNAP_NAME_PREFIX + '%Y-%m-%dT%H:%M:%SZ')
    image_path = _rbd_image_path(pool=pool, namespace=namespace, image=image)
    snapshot_path = _rbd_image_path(pool=pool, namespace=namespace, image=image, snapshot=snapshot)

    logger.info(f'Performing initial backup of {volume}:{image_path}')

    snapshot_create(volume=volume, pool=pool, namespace=namespace, image=image, snapshot=snapshot, context=context)
    allocation_map = subprocess_run(['rbd', 'diff', '--whole-object', '--format=json', snapshot_path])
    assert isinstance(allocation_map, str)

    with NamedTemporaryFile(mode='w+', encoding='utf-8') as rbd_hints:
        rbd_hints.write(allocation_map)
        rbd_hints.flush()

        benji_args = [
            'benji', '--machine-output', '--log-level', benji_log_level, 'backup', '--snapshot', snapshot,
            '--rbd-hints', rbd_hints.name
        ]
        if version_uid is not None:
            benji_args += ['--uid', version_uid]
        benji_args += [arg for name, value in version_labels.items() for arg in ('--label', f'{name}={value}')]
        benji_args += [f'{pool}:{snapshot_path}', volume]

        # Run Benji while the hints file still exists (NamedTemporaryFile is
        # removed as soon as the context manager exits).
        result = subprocess_run(benji_args, decode_json=True)
        assert isinstance(result, dict)

    if source_compare:
        # We won't evaluate the returned result but any failure will raise an exception.
        deep_scrub(pool=pool, namespace=namespace, image=image, snapshot=snapshot, version_uid=version_uid)

    return result
def ceph_snapshot_create_pre(sender: str, volume: str, pool: str, namespace: str, image: str, snapshot: str,
                             context: Dict[str, Any]) -> None:
    """Freeze the PV's filesystem before the RBD snapshot is taken.

    Records 'pv-fsfreeze', 'pv-host-ip' and 'pv-fsfreeze-pod' in *context* for the
    post signal handlers. If freezing fails, an unfreeze is attempted immediately
    and the original exception is re-raised.
    """
    assert isinstance(context, dict)
    assert 'pvc' in context
    pvc_namespace = context['pvc'].metadata.namespace
    pvc_name = context['pvc'].metadata.name
    pv_mount_point = context['pv-mount-point']
    if pv_mount_point is None:
        logger.warning(f'Mount path of PV is not known, skipping fsfreeze.')
        # Bug fix: actually skip. Previously execution fell through and could end
        # up calling fsfreeze with a None mount point. Record that no freeze
        # happened so the post handlers (which read 'pv-fsfreeze' unconditionally)
        # do not fail and do not attempt an unfreeze.
        context['pv-fsfreeze'] = False
        return

    pv_fsfreeze, pv_host_ip, pv_fsfreeze_pod = _determine_fsfreeze_info(pvc_namespace, pvc_name, image)
    # Record for use in post signals
    context['pv-fsfreeze'] = pv_fsfreeze
    context['pv-host-ip'] = pv_host_ip
    context['pv-fsfreeze-pod'] = pv_fsfreeze_pod

    if not pv_fsfreeze:
        return
    if pv_host_ip is None:
        logger.info(f'PV is not mounted anywhere, skipping fsfreeze.')
        return
    if pv_fsfreeze_pod is None:
        logger.warning(f'No fsfreeze pod found for host {pv_host_ip}, skipping fsfreeze for this PV.')
        return

    logger.info(f'Freezing filesystem {pv_mount_point} on host {pv_host_ip} (pod {pv_fsfreeze_pod}).')
    service_account_namespace = benji.k8s_tools.kubernetes.service_account_namespace()
    try:
        benji.k8s_tools.kubernetes.pod_exec(['fsfreeze', '--freeze', pv_mount_point],
                                            name=pv_fsfreeze_pod,
                                            namespace=service_account_namespace,
                                            container=FSFREEZE_CONTAINER_NAME,
                                            timeout=FSFREEZE_TIMEOUT)
    except Exception as exception:
        # Try to unfreeze in any case, so a failed freeze cannot leave the
        # filesystem frozen behind us.
        try:
            benji.k8s_tools.kubernetes.pod_exec(['fsfreeze', '--unfreeze', pv_mount_point],
                                                name=pv_fsfreeze_pod,
                                                namespace=service_account_namespace,
                                                container=FSFREEZE_CONTAINER_NAME,
                                                timeout=FSFREEZE_TIMEOUT)
        except Exception as exception_2:
            # Both freeze and unfreeze failed; surface the unfreeze failure with
            # the original freeze failure as its cause.
            raise exception_2 from exception
        else:
            raise exception
    logger.debug(f'Freezing filesystem succeeded.')
def push(registry: CollectorRegistry, grouping_key: Dict[str, str]):
    """Push the collected metrics to the configured Prometheus push gateway.

    A no-op unless both a gateway and a Benji instance name are configured.
    Network failures (URLError) are logged and ignored — metrics delivery is
    best effort and must not fail the backup run.
    """
    if prom_push_gateway is None or benji_instance is None:
        return

    logger.info(f'Pushing Prometheus metrics to gateway {prom_push_gateway}.')
    logger.debug(generate_latest(registry).decode('utf-8'))
    try:
        pushadd_to_gateway(prom_push_gateway, job=benji_instance, registry=registry, grouping_key=grouping_key)
    except urllib.error.URLError as exception:
        logger.error(f'Pushing Prometheus metrics failed with a {type(exception).__name__} exception: {str(exception)}')
        logger.error('Ignoring.')
def deep_scrub(*, pool: str, namespace: str = '', image: str, snapshot: str,
               version_uid: Optional[str]) -> Dict[str, str]:
    """Compare the Benji version *version_uid* against its RBD snapshot source.

    Runs 'benji deep-scrub' with the snapshot as the source and returns Benji's
    machine-readable result dict; a mismatch raises via subprocess_run.
    """
    snapshot_path = _rbd_image_path(pool=pool, namespace=namespace, image=image, snapshot=snapshot)
    logger.info(f'Comparing source {pool}:{snapshot_path} to {version_uid}.')

    scrub_args = ['benji', '--machine-output', '--log-level', benji_log_level, 'deep-scrub']
    scrub_args += ['--source', f'{pool}:{snapshot_path}', version_uid]

    result = subprocess_run(scrub_args, decode_json=True)
    assert isinstance(result, dict)
    return result
def main():
    """Restore a Benji version into a (possibly newly created) Kubernetes PVC.

    Steps: look up the version's size via 'benji ls', create the PVC if it does
    not exist (or verify size/--force if it does), wait for the PVC to become
    Bound, resolve the backing RBD image, and run 'benji restore' into it.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, allow_abbrev=False)
    parser.add_argument('-f',
                        '--force',
                        dest='force',
                        action='store_true',
                        default=False,
                        help='Overwrite content of existing persistent volumes')
    parser.add_argument('--pvc-storage-class',
                        metavar='pvc_storage_class',
                        dest='pvc_storage_class',
                        default=None,
                        help='PVC storage class (only takes effect if the PVC does not exist yet)')
    parser.add_argument('--restore-url-template',
                        metavar='restore_url_template',
                        dest='restore_url_template',
                        help='Template to use for constructing URL for benji restore call',
                        default='rbd:{pool}/{namespace}/{image}')
    parser.add_argument(metavar='version_uid', dest='version_uid', help='Version uid')
    parser.add_argument(metavar='pvc_namespace', dest='pvc_namespace', help='PVC namespace')
    parser.add_argument(metavar='pvc_name', dest='pvc_name', help='PVC name')
    args = parser.parse_args()

    benji.k8s_tools.kubernetes.load_config()
    logger.info(f'Restoring version {args.version_uid} to PVC {args.pvc_namespace}/{args.pvc_name}.')

    # Determine the version's size so the PVC can be created (or checked) to fit it.
    benji_ls = subprocess_run(
        ['benji', '--machine-output', '--log-level', settings.benji_log_level, 'ls', f'uid == "{args.version_uid}"'],
        decode_json=True)
    assert isinstance(benji_ls, dict)
    assert 'versions' in benji_ls
    assert isinstance(benji_ls['versions'], list)
    if len(benji_ls['versions']) == 0:
        raise RuntimeError(f'Size of {args.version_uid} could not be determined.')
    assert isinstance(benji_ls['versions'][0], dict)
    assert isinstance(benji_ls['versions'][0]['size'], int)
    version_size = benji_ls['versions'][0]['size']

    # This assumes that the Kubernetes client has already been initialized
    core_v1_api = kubernetes.client.CoreV1Api()
    pvc = None
    try:
        pvc = core_v1_api.read_namespaced_persistent_volume_claim(args.pvc_name, args.pvc_namespace)
    except ApiException as exception:
        # 404 means the PVC does not exist yet (we will create it); anything else is fatal.
        if exception.status != 404:
            raise RuntimeError(f'Unexpected Kubernetes API exception: {str(exception)}')

    if pvc is None:
        pvc = benji.k8s_tools.kubernetes.create_pvc(name=args.pvc_name,
                                                    namespace=args.pvc_namespace,
                                                    size=version_size,
                                                    storage_class=args.pvc_storage_class)
    else:
        if not args.force:
            raise RuntimeError('PVC already exists. Will not overwrite it unless forced.')
        # I don't really understand why capacity is a regular dict and not an object. Oh, well.
        pvc_size = int(benji.k8s_tools.kubernetes.parse_quantity(pvc.status.capacity['storage']))
        if pvc_size < version_size:
            raise RuntimeError(f'Existing PVC is too small to hold version {args.version_uid} ({pvc_size} < {version_size}).')
        elif pvc_size > version_size:
            logger.warning(f'Existing PVC is {pvc_size - version_size} bytes bigger than version {args.version_uid}.')

    # Poll until the PVC is bound to a PV or the poll budget is exhausted.
    polls = 0
    while polls < PVC_CREATION_MAX_POLLS:
        pvc = core_v1_api.read_namespaced_persistent_volume_claim(args.pvc_name, args.pvc_namespace)
        if pvc.status.phase == 'Bound':
            break
        time.sleep(PVC_CREATION_POLL_INTERVAL)
        polls += 1
        logger.info('Waiting for persistent volume creation... %d/%d', polls, PVC_CREATION_MAX_POLLS)
    if pvc.status.phase == 'Bound':
        logger.info('Persistent volume creation completed.')
    else:
        logger.error('Persistent volume creation did not complete after %d seconds.',
                     PVC_CREATION_MAX_POLLS * PVC_CREATION_POLL_INTERVAL)
        sys.exit(os.EX_CANTCREAT)

    # Resolve the RBD pool/namespace/image backing the bound PV.
    pv = core_v1_api.read_persistent_volume(pvc.spec.volume_name)
    rbd_info = benji.k8s_tools.kubernetes.determine_rbd_info_from_pv(pv)
    if rbd_info is None:
        raise RuntimeError(f'Unable to determine RBD information for {pv.metadata.name}')

    print(
        subprocess_run([
            'benji',
            '--machine-output',
            '--log-level',
            settings.benji_log_level,
            'restore',
            '--sparse',
            '--force',
            args.version_uid,
            args.restore_url_template.format(pool=rbd_info.pool, namespace=rbd_info.namespace, image=rbd_info.image),
        ]))
    sys.exit(0)
def main():
    """Back up every RBD-backed PVC matching the namespace/label/field filters.

    For each bound, RBD-backed PVC a per-PVC volume name, version UID and label
    set are derived and ceph.backup() is invoked.
    """
    # This arguments parser tries to mimic kubectl
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, allow_abbrev=False)
    parser.add_argument('-n',
                        '--namespace',
                        metavar='namespace',
                        dest='namespace',
                        default=None,
                        help='Filter on namespace')
    parser.add_argument('-l',
                        '--selector',
                        metavar='label-selector',
                        dest='labels',
                        action='append',
                        default=[],
                        help='Filter PVCs on label selector')
    parser.add_argument('--field-selector',
                        metavar='field-selector',
                        dest='fields',
                        action='append',
                        default=[],
                        help='Filter PVCs on field selector')
    parser.add_argument('--source-compare',
                        dest='source_compare',
                        action='store_true',
                        default=False,
                        help='Compare version to source after backup')
    args = parser.parse_args()

    benji.k8s_tools.kubernetes.load_config()
    core_v1_api = kubernetes.client.CoreV1Api()

    # Repeated -l/--field-selector options are joined into the comma-separated
    # form the Kubernetes API expects.
    labels = ','.join(args.labels)
    fields = ','.join(args.fields)

    if args.namespace is not None:
        logger.info(f'Backing up all PVCs in namespace {args.namespace}.')
    else:
        logger.info(f'Backing up all PVCs in all namespaces.')
    if labels != '':
        logger.info(f'Matching label(s) {labels}.')
    if fields != '':
        logger.info(f'Matching field(s) {fields}.')

    if args.namespace is not None:
        pvcs = core_v1_api.list_namespaced_persistent_volume_claim(args.namespace,
                                                                   watch=False,
                                                                   label_selector=labels,
                                                                   field_selector=fields).items
    else:
        pvcs = core_v1_api.list_persistent_volume_claim_for_all_namespaces(watch=False,
                                                                           label_selector=labels,
                                                                           field_selector=fields).items
    if len(pvcs) == 0:
        # Bug fix: message previously read 'Not matching PVCs found.'
        logger.info('No matching PVCs found.')
        sys.exit(0)

    for pvc in pvcs:
        # Skip PVCs that are not bound to a PV yet.
        if not hasattr(pvc.spec, 'volume_name') or pvc.spec.volume_name in (None, ''):
            continue

        pv = core_v1_api.read_persistent_volume(pvc.spec.volume_name)
        rbd_info = benji.k8s_tools.kubernetes.determine_rbd_info_from_pv(pv)
        if rbd_info is None:
            logger.debug(f'PersistentVolume {pv.metadata.name} is not an RBD backed volume '
                         f'or the volume format is unknown to us.')
            continue

        volume = f'{pvc.metadata.namespace}/{pvc.metadata.name}'
        # Limit the version_uid to 253 characters so that it is a compatible Kubernetes resource name.
        version_uid = '{}-{}'.format(f'{pvc.metadata.namespace}-{pvc.metadata.name}'[:246], _random_string(6))

        version_labels = {
            'benji-backup.me/instance': settings.benji_instance,
            'benji-backup.me/ceph-pool': rbd_info.pool,
            'benji-backup.me/ceph-namespace': rbd_info.namespace,
            'benji-backup.me/ceph-rbd-image': rbd_info.image,
            'benji-backup.me/k8s-pvc-namespace': pvc.metadata.namespace,
            'benji-backup.me/k8s-pvc': pvc.metadata.name,
            'benji-backup.me/k8s-pv': pv.metadata.name
        }

        # Context travels to the pre/post signal handlers (e.g. fsfreeze).
        context = {
            'pvc': pvc,
            'pv': pv,
            'pv-mount-point': rbd_info.mount_point,
        }
        ceph.backup(volume=volume,
                    pool=rbd_info.pool,
                    namespace=rbd_info.namespace,
                    image=rbd_info.image,
                    version_uid=version_uid,
                    version_labels=version_labels,
                    source_compare=args.source_compare,
                    context=context)

    sys.exit(0)
def backup(*,
           volume: str,
           pool: str,
           namespace: str = '',
           image: str,
           version_labels: Optional[Dict[str, str]] = None,
           version_uid: Optional[str] = None,
           source_compare: bool = False,
           context: Any = None):
    """Back up an RBD image, choosing between initial and differential backup.

    Sends the backup pre/post signals around the work. If a previous Benji
    snapshot exists and its version is still valid in Benji, a differential
    backup against it is performed; otherwise stale snapshots are removed and
    an initial (full) backup is done.

    NOTE(review): on failure the exception is not re-raised here — error
    handling appears to be delegated to signal_backup_post_error receivers,
    which may re-raise; confirm against the registered handlers.
    """
    # Bug fix: avoid a mutable default argument ({} was shared across calls).
    if version_labels is None:
        version_labels = {}

    signal_backup_pre.send(SIGNAL_SENDER,
                           volume=volume,
                           pool=pool,
                           namespace=namespace,
                           image=image,
                           version_labels=version_labels,
                           context=context)
    version = None
    try:
        image_path = _rbd_image_path(pool=pool, namespace=namespace, image=image)
        rbd_snap_ls = subprocess_run(['rbd', 'snap', 'ls', '--format=json', image_path], decode_json=True)
        assert isinstance(rbd_snap_ls, list)
        # Snapshots are sorted by their ID, so newer snapshots come last.
        benjis_snapshots = [
            snapshot['name'] for snapshot in rbd_snap_ls if snapshot['name'].startswith(RBD_SNAP_NAME_PREFIX)
        ]
        if len(benjis_snapshots) == 0:
            logger.info('No previous RBD snapshot found, performing initial backup.')
            result = backup_initial(volume=volume,
                                    pool=pool,
                                    namespace=namespace,
                                    image=image,
                                    version_uid=version_uid,
                                    version_labels=version_labels,
                                    source_compare=source_compare,
                                    context=context)
        else:
            # Delete all snapshots except the newest.
            for snapshot in benjis_snapshots[:-1]:
                snapshot_path = _rbd_image_path(pool=pool, namespace=namespace, image=image, snapshot=snapshot)
                logger.info(f'Deleting older RBD snapshot {snapshot_path}.')
                subprocess_run(['rbd', 'snap', 'rm', '--no-progress', snapshot_path])

            last_snapshot = benjis_snapshots[-1]
            last_snapshot_path = _rbd_image_path(pool=pool, namespace=namespace, image=image, snapshot=last_snapshot)
            logger.info(f'Newest RBD snapshot is {last_snapshot_path}.')

            # Check whether Benji still has a valid version for that snapshot.
            benji_ls = subprocess_run([
                'benji', '--machine-output', '--log-level', benji_log_level, 'ls',
                f'volume == "{volume}" and snapshot == "{last_snapshot}" and status == "valid"'
            ],
                                      decode_json=True)
            assert isinstance(benji_ls, dict)
            assert 'versions' in benji_ls
            assert isinstance(benji_ls['versions'], list)
            if len(benji_ls['versions']) > 0:
                assert 'uid' in benji_ls['versions'][0]
                last_version_uid = benji_ls['versions'][0]['uid']
                assert isinstance(last_version_uid, str)
                result = backup_differential(volume=volume,
                                             pool=pool,
                                             namespace=namespace,
                                             image=image,
                                             last_snapshot=last_snapshot,
                                             last_version_uid=last_version_uid,
                                             version_uid=version_uid,
                                             version_labels=version_labels,
                                             source_compare=source_compare,
                                             context=context)
            else:
                logger.info(
                    f'Existing RBD snapshot {last_snapshot_path} not found in Benji, deleting it and reverting to initial backup.'
                )
                subprocess_run(['rbd', 'snap', 'rm', '--no-progress', last_snapshot_path])
                result = backup_initial(volume=volume,
                                        pool=pool,
                                        namespace=namespace,
                                        image=image,
                                        version_uid=version_uid,
                                        version_labels=version_labels,
                                        source_compare=source_compare,
                                        context=context)
        assert 'versions' in result and isinstance(result['versions'], list)
        version = result['versions'][0]
    except Exception as exception:
        signal_backup_post_error.send(SIGNAL_SENDER,
                                      volume=volume,
                                      pool=pool,
                                      namespace=namespace,
                                      image=image,
                                      version_labels=version_labels,
                                      context=context,
                                      version=version,
                                      exception=exception)
    else:
        signal_backup_post_success.send(SIGNAL_SENDER,
                                        volume=volume,
                                        pool=pool,
                                        namespace=namespace,
                                        image=image,
                                        version_labels=version_labels,
                                        context=context,
                                        version=version)