Exemple #1
0
def ceph_snapshot_create_post_success(sender: str, volume: str, pool: str, namespace: str, image: str, snapshot: str,
                                      context: Dict[str, Any]) -> None:
    """Signal handler: unfreeze the PV's filesystem after a successful snapshot.

    Reads the fsfreeze information that the pre signal handler recorded in
    ``context`` and retries the unfreeze with the delays listed in
    FSFREEZE_UNFREEZE_TRIES, logging an error if every try fails.
    """
    assert isinstance(context, dict)
    # 'pv-fsfreeze' et al. are recorded by the corresponding pre signal handler.
    pv_fsfreeze = context['pv-fsfreeze']
    if not pv_fsfreeze:
        # Nothing was frozen, so there is nothing to unfreeze.
        return

    pv_host_ip = context['pv-host-ip']
    pv_fsfreeze_pod = context['pv-fsfreeze-pod']
    pv_mount_point = context['pv-mount-point']

    logger.info(f'Unfreezing filesystem {pv_mount_point} on host {pv_host_ip}.')

    service_account_namespace = benji.k8s_tools.kubernetes.service_account_namespace()
    for delay in FSFREEZE_UNFREEZE_TRIES:
        # The first entry is expected to be 0 so the initial try is immediate.
        if delay > 0:
            time.sleep(delay)

        try:
            benji.k8s_tools.kubernetes.pod_exec(['fsfreeze', '--unfreeze', pv_mount_point],
                                                name=pv_fsfreeze_pod,
                                                namespace=service_account_namespace,
                                                container=FSFREEZE_CONTAINER_NAME,
                                                timeout=FSFREEZE_TIMEOUT)
        except Exception as exception:
            # Don't swallow the failure silently: log it, then retry after the next delay.
            logger.warning(f'Unfreezing filesystem failed with a {type(exception).__name__} exception: {str(exception)}')
        else:
            logger.debug('Unfreezing filesystem succeeded.')
            break
    else:
        logger.error(f'Giving up on unfreezing filesystem {pv_mount_point} on host {pv_host_ip}.')
Exemple #2
0
def backup_initial(*,
                   volume: str,
                   pool: str,
                   namespace: str = '',
                   image: str,
                   version_labels: Dict[str, str],
                   version_uid: Optional[str],
                   source_compare: bool = False,
                   context: Any = None) -> Dict[str, str]:
    """Create a fresh RBD snapshot of the image and back it up in full.

    The snapshot's 'rbd diff' output is passed to Benji as hints so only
    allocated regions are read. Returns the machine-readable result of the
    'benji backup' invocation.
    """
    snapshot = datetime.utcnow().strftime(RBD_SNAP_NAME_PREFIX + '%Y-%m-%dT%H:%M:%SZ')
    image_path = _rbd_image_path(pool=pool, namespace=namespace, image=image)
    snapshot_path = _rbd_image_path(pool=pool, namespace=namespace, image=image, snapshot=snapshot)
    logger.info(f'Performing initial backup of {volume}:{image_path}')

    snapshot_create(volume=volume, pool=pool, namespace=namespace, image=image, snapshot=snapshot, context=context)

    rbd_diff = subprocess_run(['rbd', 'diff', '--whole-object', '--format=json', snapshot_path])
    assert isinstance(rbd_diff, str)

    with NamedTemporaryFile(mode='w+', encoding='utf-8') as rbd_hints:
        rbd_hints.write(rbd_diff)
        rbd_hints.flush()

        args = ['benji', '--machine-output', '--log-level', benji_log_level, 'backup', '--snapshot', snapshot,
                '--rbd-hints', rbd_hints.name]
        if version_uid is not None:
            args += ['--uid', version_uid]
        for label_name, label_value in version_labels.items():
            args += ['--label', f'{label_name}={label_value}']
        args += [f'{pool}:{snapshot_path}', volume]

        result = subprocess_run(args, decode_json=True)
        assert isinstance(result, dict)

    if source_compare:
        # We won't evaluate the returned result but any failure will raise an exception.
        deep_scrub(pool=pool, namespace=namespace, image=image, snapshot=snapshot, version_uid=version_uid)

    return result
Exemple #3
0
def ceph_snapshot_create_pre(sender: str, volume: str, pool: str, namespace: str, image: str, snapshot: str,
                             context: Dict[str, Any]) -> None:
    """Signal handler: freeze the PV's filesystem before a snapshot is taken.

    Records the determined fsfreeze information in ``context`` so that the
    post signal handlers can perform (or skip) the matching unfreeze. If the
    freeze fails, an immediate unfreeze is attempted before re-raising.
    """
    assert isinstance(context, dict)
    assert 'pvc' in context
    pvc_namespace = context['pvc'].metadata.namespace
    pvc_name = context['pvc'].metadata.name
    pv_mount_point = context['pv-mount-point']

    if pv_mount_point is None:
        logger.warning('Mount path of PV is not known, skipping fsfreeze.')
        # Bug fix: previously execution fell through and tried to freeze a None
        # mount point. Record the skip so the post handlers (which index these
        # keys directly) don't fail either, and bail out.
        context['pv-fsfreeze'] = False
        context['pv-host-ip'] = None
        context['pv-fsfreeze-pod'] = None
        return

    pv_fsfreeze, pv_host_ip, pv_fsfreeze_pod = _determine_fsfreeze_info(pvc_namespace, pvc_name, image)

    # Record for use in post signals
    context['pv-fsfreeze'] = pv_fsfreeze
    context['pv-host-ip'] = pv_host_ip
    context['pv-fsfreeze-pod'] = pv_fsfreeze_pod

    if not pv_fsfreeze:
        return
    if pv_host_ip is None:
        logger.info('PV is not mounted anywhere, skipping fsfreeze.')
        return
    if pv_fsfreeze_pod is None:
        logger.warning(f'No fsfreeze pod found for host {pv_host_ip}, skipping fsfreeze for this PV.')
        return

    logger.info(f'Freezing filesystem {pv_mount_point} on host {pv_host_ip} (pod {pv_fsfreeze_pod}).')

    service_account_namespace = benji.k8s_tools.kubernetes.service_account_namespace()
    try:
        benji.k8s_tools.kubernetes.pod_exec(['fsfreeze', '--freeze', pv_mount_point],
                                            name=pv_fsfreeze_pod,
                                            namespace=service_account_namespace,
                                            container=FSFREEZE_CONTAINER_NAME,
                                            timeout=FSFREEZE_TIMEOUT)
    except Exception as exception:
        # Try to unfreeze in any case, so a half-applied freeze doesn't leave
        # the filesystem stuck.
        try:
            benji.k8s_tools.kubernetes.pod_exec(['fsfreeze', '--unfreeze', pv_mount_point],
                                                name=pv_fsfreeze_pod,
                                                namespace=service_account_namespace,
                                                container=FSFREEZE_CONTAINER_NAME,
                                                timeout=FSFREEZE_TIMEOUT)
        except Exception as exception_2:
            # Surface both failures: the unfreeze error chained onto the freeze error.
            raise exception_2 from exception
        else:
            raise exception

    logger.debug('Freezing filesystem succeeded.')
Exemple #4
0
def push(registry: CollectorRegistry, grouping_key: Dict[str, str]):
    """Push the collected metrics to the configured Prometheus push gateway.

    A no-op unless both prom_push_gateway and benji_instance are configured.
    Failures to reach the gateway are logged and otherwise ignored.
    """
    if prom_push_gateway is None or benji_instance is None:
        return

    logger.info(f'Pushing Prometheus metrics to gateway {prom_push_gateway}.')
    logger.debug(generate_latest(registry).decode('utf-8'))

    try:
        pushadd_to_gateway(prom_push_gateway, job=benji_instance, registry=registry, grouping_key=grouping_key)
    except urllib.error.URLError as exception:
        # Metrics delivery is best effort; a dead gateway must not fail the backup.
        logger.error(f'Pushing Prometheus metrics failed with a {type(exception).__name__} exception: {str(exception)}')
        logger.error('Ignoring.')
Exemple #5
0
def deep_scrub(*,
               pool: str,
               namespace: str = '',
               image: str,
               snapshot: str,
               version_uid: Optional[str]) -> Dict[str, str]:
    """Deep-scrub a Benji version against its RBD snapshot source.

    Returns the machine-readable result of the 'benji deep-scrub' invocation;
    a mismatch or failure raises via subprocess_run.

    Raises:
        ValueError: if version_uid is None — subprocess argv entries must be
            strings, so fail early with a clear message instead of a TypeError
            deep inside subprocess handling.
    """
    if version_uid is None:
        raise ValueError('A version UID is required to deep-scrub against the source.')

    snapshot_path = _rbd_image_path(pool=pool,
                                    namespace=namespace,
                                    image=image,
                                    snapshot=snapshot)
    logger.info(f'Comparing source {pool}:{snapshot_path} to {version_uid}.')

    benji_args = [
        'benji', '--machine-output', '--log-level', benji_log_level,
        'deep-scrub', '--source', f'{pool}:{snapshot_path}', version_uid
    ]

    result = subprocess_run(benji_args, decode_json=True)
    assert isinstance(result, dict)

    return result
Exemple #6
0
def main():
    """Restore a Benji version into a Kubernetes PVC.

    Looks up the version's size, creates the PVC if it does not exist (or
    reuses an existing one when --force is given and it is large enough),
    waits for the claim to be bound, and finally runs 'benji restore' against
    the PV's backing RBD image.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, allow_abbrev=False)

    parser.add_argument('-f',
                        '--force',
                        dest='force',
                        action='store_true',
                        default=False,
                        help='Overwrite content of existing persistent volumes')
    parser.add_argument('--pvc-storage-class',
                        metavar='pvc_storage_class',
                        dest='pvc_storage_class',
                        default=None,
                        help='PVC storage class (only takes effect if the PVC does not exist yet)')
    parser.add_argument('--restore-url-template',
                        metavar='restore_url_template',
                        dest='restore_url_template',
                        help='Template to use for constructing URL for benji restore call',
                        default='rbd:{pool}/{namespace}/{image}')
    parser.add_argument(metavar='version_uid', dest='version_uid', help='Version uid')
    parser.add_argument(metavar='pvc_namespace', dest='pvc_namespace', help='PVC namespace')
    parser.add_argument(metavar='pvc_name', dest='pvc_name', help='PVC name')

    args = parser.parse_args()

    benji.k8s_tools.kubernetes.load_config()

    logger.info(f'Restoring version {args.version_uid} to PVC {args.pvc_namespace}/{args.pvc_name}.')

    # Determine the version's size so the PVC can be created (or checked) to fit.
    benji_ls = subprocess_run(
        ['benji', '--machine-output', '--log-level', settings.benji_log_level, 'ls', f'uid == "{args.version_uid}"'],
        decode_json=True)
    assert isinstance(benji_ls, dict)
    assert 'versions' in benji_ls
    assert isinstance(benji_ls['versions'], list)

    if len(benji_ls['versions']) == 0:
        raise RuntimeError(f'Size of {args.version_uid} could not be determined.')

    assert isinstance(benji_ls['versions'][0], dict)
    assert isinstance(benji_ls['versions'][0]['size'], int)
    version_size = benji_ls['versions'][0]['size']

    # This assumes that the Kubernetes client has already been initialized
    core_v1_api = kubernetes.client.CoreV1Api()
    pvc = None
    try:
        pvc = core_v1_api.read_namespaced_persistent_volume_claim(args.pvc_name, args.pvc_namespace)
    except ApiException as exception:
        # A 404 just means the PVC does not exist yet; anything else is fatal.
        # Chain the original exception so the API error details are preserved.
        if exception.status != 404:
            raise RuntimeError(f'Unexpected Kubernetes API exception: {str(exception)}') from exception

    if pvc is None:
        pvc = benji.k8s_tools.kubernetes.create_pvc(name=args.pvc_name,
                                                    namespace=args.pvc_namespace,
                                                    size=version_size,
                                                    storage_class=args.pvc_storage_class)
    else:
        if not args.force:
            raise RuntimeError('PVC already exists. Will not overwrite it unless forced.')

        # I don't really understand why capacity is a regular dict and not an object. Oh, well.
        pvc_size = int(benji.k8s_tools.kubernetes.parse_quantity(pvc.status.capacity['storage']))
        if pvc_size < version_size:
            raise RuntimeError(f'Existing PVC is too small to hold version {args.version_uid} ({pvc_size} < {version_size}).')
        elif pvc_size > version_size:
            logger.warning(f'Existing PVC is {pvc_size - version_size} bytes bigger than version {args.version_uid}.')

    # Poll until the claim is bound to a PV (or give up after the poll budget).
    polls = 0
    while polls < PVC_CREATION_MAX_POLLS:
        pvc = core_v1_api.read_namespaced_persistent_volume_claim(args.pvc_name, args.pvc_namespace)
        if pvc.status.phase == 'Bound':
            break
        time.sleep(PVC_CREATION_POLL_INTERVAL)
        polls += 1
        logger.info('Waiting for persistent volume creation... %d/%d', polls, PVC_CREATION_MAX_POLLS)
    if pvc.status.phase == 'Bound':
        logger.info('Persistent volume creation completed.')
    else:
        logger.error('Persistent volume creation did not complete after %d seconds.',
                     PVC_CREATION_MAX_POLLS * PVC_CREATION_POLL_INTERVAL)
        sys.exit(os.EX_CANTCREAT)

    pv = core_v1_api.read_persistent_volume(pvc.spec.volume_name)
    rbd_info = benji.k8s_tools.kubernetes.determine_rbd_info_from_pv(pv)
    if rbd_info is None:
        raise RuntimeError(f'Unable to determine RBD information for {pv.metadata.name}')

    print(
        subprocess_run([
            'benji',
            '--machine-output',
            '--log-level',
            settings.benji_log_level,
            'restore',
            '--sparse',
            '--force',
            args.version_uid,
            args.restore_url_template.format(pool=rbd_info.pool, namespace=rbd_info.namespace, image=rbd_info.image),
        ]))
    sys.exit(0)
Exemple #7
0
def main():
    """Back up every matching RBD-backed PVC with Benji.

    Mimics kubectl's namespace/label/field selector flags, resolves each PVC
    to its backing RBD image, and hands the image to ceph.backup().
    """
    # This arguments parser tries to mimic kubectl
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, allow_abbrev=False)

    parser.add_argument('-n',
                        '--namespace',
                        metavar='namespace',
                        dest='namespace',
                        default=None,
                        help='Filter on namespace')
    parser.add_argument('-l',
                        '--selector',
                        metavar='label-selector',
                        dest='labels',
                        action='append',
                        default=[],
                        help='Filter PVCs on label selector')
    parser.add_argument('--field-selector',
                        metavar='field-selector',
                        dest='fields',
                        action='append',
                        default=[],
                        help='Filter PVCs on field selector')
    parser.add_argument('--source-compare',
                        dest='source_compare',
                        action='store_true',
                        default=False,
                        help='Compare version to source after backup')

    args = parser.parse_args()

    benji.k8s_tools.kubernetes.load_config()
    core_v1_api = kubernetes.client.CoreV1Api()

    # Repeated -l/--field-selector options are combined into one selector string.
    labels = ','.join(args.labels)
    fields = ','.join(args.fields)

    if args.namespace is not None:
        logger.info(f'Backing up all PVCs in namespace {args.namespace}.')
    else:
        logger.info('Backing up all PVCs in all namespaces.')
    if labels != '':
        logger.info(f'Matching label(s) {labels}.')
    if fields != '':
        logger.info(f'Matching field(s) {fields}.')

    if args.namespace is not None:
        pvcs = core_v1_api.list_namespaced_persistent_volume_claim(args.namespace,
                                                                   watch=False,
                                                                   label_selector=labels,
                                                                   field_selector=fields).items
    else:
        pvcs = core_v1_api.list_persistent_volume_claim_for_all_namespaces(watch=False,
                                                                           label_selector=labels,
                                                                           field_selector=fields).items
    if len(pvcs) == 0:
        # Fixed log message grammar ('Not matching' -> 'No matching').
        logger.info('No matching PVCs found.')
        sys.exit(0)

    for pvc in pvcs:
        # Skip claims that are not bound to a volume yet.
        if not hasattr(pvc.spec, 'volume_name') or pvc.spec.volume_name in (None, ''):
            continue

        pv = core_v1_api.read_persistent_volume(pvc.spec.volume_name)
        rbd_info = benji.k8s_tools.kubernetes.determine_rbd_info_from_pv(pv)
        if rbd_info is None:
            logger.debug(f'PersistentVolume {pv.metadata.name} is not an RBD backed volume '
                         f'or the volume format is unknown to us.')
            continue

        volume = f'{pvc.metadata.namespace}/{pvc.metadata.name}'
        # Limit the version_uid to 253 characters so that it is a compatible Kubernetes resource name.
        version_uid = '{}-{}'.format(f'{pvc.metadata.namespace}-{pvc.metadata.name}'[:246], _random_string(6))

        version_labels = {
            'benji-backup.me/instance': settings.benji_instance,
            'benji-backup.me/ceph-pool': rbd_info.pool,
            'benji-backup.me/ceph-namespace': rbd_info.namespace,
            'benji-backup.me/ceph-rbd-image': rbd_info.image,
            'benji-backup.me/k8s-pvc-namespace': pvc.metadata.namespace,
            'benji-backup.me/k8s-pvc': pvc.metadata.name,
            'benji-backup.me/k8s-pv': pv.metadata.name
        }

        # The context is passed through to the snapshot signal handlers
        # (they read 'pvc' and 'pv-mount-point').
        context = {
            'pvc': pvc,
            'pv': pv,
            'pv-mount-point': rbd_info.mount_point,
        }
        ceph.backup(volume=volume,
                    pool=rbd_info.pool,
                    namespace=rbd_info.namespace,
                    image=rbd_info.image,
                    version_uid=version_uid,
                    version_labels=version_labels,
                    source_compare=args.source_compare,
                    context=context)

    sys.exit(0)
Exemple #8
0
def backup(*,
           volume: str,
           pool: str,
           namespace: str = '',
           image: str,
           version_labels: Optional[Dict[str, str]] = None,
           version_uid: Optional[str] = None,
           source_compare: bool = False,
           context: Any = None):
    """Back up one RBD image with Benji, differentially when possible.

    Sends the backup pre/post signals around the work. An initial backup is
    performed when no usable previous Benji RBD snapshot exists; otherwise a
    differential backup against the newest such snapshot is done. Errors are
    reported through signal_backup_post_error.

    Args:
        volume: Benji volume name to store the backup under.
        pool: Ceph pool containing the image.
        namespace: RADOS namespace of the image ('' for the default one).
        image: Name of the RBD image.
        version_labels: Labels to attach to the created version.
        version_uid: Explicit UID for the new version (None lets Benji choose).
        source_compare: Deep-scrub the new version against the source when True.
        context: Opaque object passed through to the signal handlers.
    """
    # Use a None sentinel instead of a mutable default argument: a shared
    # default dict could be mutated by signal receivers and leak across calls.
    if version_labels is None:
        version_labels = {}

    signal_backup_pre.send(SIGNAL_SENDER,
                           volume=volume,
                           pool=pool,
                           namespace=namespace,
                           image=image,
                           version_labels=version_labels,
                           context=context)
    version = None
    try:
        image_path = _rbd_image_path(pool=pool,
                                     namespace=namespace,
                                     image=image)
        rbd_snap_ls = subprocess_run(
            ['rbd', 'snap', 'ls', '--format=json', image_path],
            decode_json=True)
        assert isinstance(rbd_snap_ls, list)
        # Snapshot are sorted by their ID, so newer snapshots come last
        benjis_snapshots = [
            snapshot['name'] for snapshot in rbd_snap_ls
            if snapshot['name'].startswith(RBD_SNAP_NAME_PREFIX)
        ]
        if len(benjis_snapshots) == 0:
            logger.info(
                'No previous RBD snapshot found, performing initial backup.')
            result = backup_initial(volume=volume,
                                    pool=pool,
                                    namespace=namespace,
                                    image=image,
                                    version_uid=version_uid,
                                    version_labels=version_labels,
                                    source_compare=source_compare,
                                    context=context)
        else:
            # Delete all snapshots except the newest
            for snapshot in benjis_snapshots[:-1]:
                snapshot_path = _rbd_image_path(pool=pool,
                                                namespace=namespace,
                                                image=image,
                                                snapshot=snapshot)
                logger.info(f'Deleting older RBD snapshot {snapshot_path}.')
                subprocess_run(
                    ['rbd', 'snap', 'rm', '--no-progress', snapshot_path])

            last_snapshot = benjis_snapshots[-1]
            last_snapshot_path = _rbd_image_path(pool=pool,
                                                 namespace=namespace,
                                                 image=image,
                                                 snapshot=last_snapshot)
            logger.info(f'Newest RBD snapshot is {last_snapshot_path}.')

            # A differential backup is only possible if Benji knows a valid
            # version that corresponds to the last snapshot.
            benji_ls = subprocess_run([
                'benji', '--machine-output', '--log-level', benji_log_level,
                'ls',
                f'volume == "{volume}" and snapshot == "{last_snapshot}" and status == "valid"'
            ],
                                      decode_json=True)
            assert isinstance(benji_ls, dict)
            assert 'versions' in benji_ls
            assert isinstance(benji_ls['versions'], list)
            if len(benji_ls['versions']) > 0:
                assert 'uid' in benji_ls['versions'][0]
                last_version_uid = benji_ls['versions'][0]['uid']
                assert isinstance(last_version_uid, str)
                result = backup_differential(volume=volume,
                                             pool=pool,
                                             namespace=namespace,
                                             image=image,
                                             last_snapshot=last_snapshot,
                                             last_version_uid=last_version_uid,
                                             version_uid=version_uid,
                                             version_labels=version_labels,
                                             source_compare=source_compare,
                                             context=context)
            else:
                logger.info(
                    f'Existing RBD snapshot {last_snapshot_path} not found in Benji, deleting it and reverting to initial backup.'
                )
                subprocess_run(
                    ['rbd', 'snap', 'rm', '--no-progress', last_snapshot_path])
                result = backup_initial(volume=volume,
                                        pool=pool,
                                        namespace=namespace,
                                        image=image,
                                        version_uid=version_uid,
                                        version_labels=version_labels,
                                        source_compare=source_compare,
                                        context=context)
        assert 'versions' in result and isinstance(result['versions'], list)
        version = result['versions'][0]
    except Exception as exception:
        # The error signal receivers decide how to handle (and whether to
        # re-raise) the failure.
        signal_backup_post_error.send(SIGNAL_SENDER,
                                      volume=volume,
                                      pool=pool,
                                      namespace=namespace,
                                      image=image,
                                      version_labels=version_labels,
                                      context=context,
                                      version=version,
                                      exception=exception)
    else:
        signal_backup_post_success.send(SIGNAL_SENDER,
                                        volume=volume,
                                        pool=pool,
                                        namespace=namespace,
                                        image=image,
                                        version_labels=version_labels,
                                        context=context,
                                        version=version)