Example no. 1
def delete_backup(config, backup_name, all_nodes):
    backups_to_purge = list()
    monitoring = Monitoring(config=config.monitoring)

    try:
        storage = Storage(config=config.storage)
        cluster_backup = storage.get_cluster_backup(backup_name)
        backups_to_purge = cluster_backup.node_backups.values()

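        # When all_nodes is False, only purge the node backups whose fqdn matches this host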
        if not all_nodes:
            backups_to_purge = [
                nb for nb in backups_to_purge if storage.config.fqdn in nb.fqdn
            ]

        logging.info('Deleting Backup {}...'.format(backup_name))
        purge_backups(storage, backups_to_purge)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e,
            'This error happened during the delete of backup "{}": {}'.format(
                backup_name, str(e)), config)
Example no. 2
def verify(config, backup_name):
    storage = Storage(config=config.storage)

    try:
        cluster_backup = storage.get_cluster_backup(backup_name)
    except KeyError:
        logging.error('No such backup')
        raise RuntimeError("Manifest validation failed")

    print('Validating {0.name} ...'.format(cluster_backup))

    if cluster_backup.is_complete():
        print('- Completion: OK!')
    else:
        print('- Completion: Not complete!')
        for incomplete_node in cluster_backup.incomplete_nodes():
            print(
                '  - [{0.fqdn}] Backup started at {0.started}, but not finished yet'
                .format(incomplete_node))
        for fqdn in cluster_backup.missing_nodes():
            print('  - [{}] Backup missing'.format(fqdn))

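    # Collect manifest validation errors from every node backup in this cluster backup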
    consistency_errors = [
        consistency_error
        for node_backup in cluster_backup.node_backups.values()
        for consistency_error in validate_manifest(storage, node_backup)
    ]

    if consistency_errors:
        print("- Manifest validation: Failed!")
        for error in consistency_errors:
            print(error)
        raise RuntimeError("Manifest validation failed")
    else:
        print("- Manifest validated: OK!!")
def orchestrate(config, backup_name, seed_target, temp_dir, host_list, keep_auth, bypass_checks,
                verify, keyspaces, tables, parallel_restores, use_sstableloader=False):
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()
        if seed_target is None and host_list is None:
            # if no target node is provided, nor a host list file, default to the local node as seed target
            hostname_resolver = HostnameResolver(medusa.utils.evaluate_boolean(config.cassandra.resolve_ip_addresses))
            seed_target = hostname_resolver.resolve_fqdn(socket.gethostbyname(socket.getfqdn()))
            logging.warning("Seed target was not provided, using the local hostname: {}".format(seed_target))

        if seed_target is not None and host_list is not None:
            err_msg = 'You must provide either a seed target or a list of hosts, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

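        # Run the restore job across the cluster and time it for the metrics emitted below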
        restore = RestoreJob(cluster_backup, config, temp_dir, host_list, seed_target, keep_auth, verify,
                             parallel_restores, keyspaces, tables, bypass_checks, use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time

        logging.debug('Emitting metrics')

        logging.info('Restore duration: {}'.format(restore_duration.seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration.seconds)

        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')

    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)

        logging.error('This error happened during the cluster restore: {}'.format(str(e)))
        traceback.print_exc()
        sys.exit(1)
Example no. 4
def main(config, backup_name):
    storage = Storage(config=config.storage)
    backup = storage.get_cluster_backup(backup_name)
    if not backup:
        logging.error('No such backup')
        sys.exit(1)

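    # Print each node's hostname followed by the tokens it owned when the backup was taken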
    for hostname, ringitem in backup.tokenmap.items():
        print(hostname)
        print(ringitem['tokens'])
Example no. 5
def status(config, backup_name):
    storage = Storage(config=config.storage)

    try:
        cluster_backup = storage.get_cluster_backup(backup_name)
    except KeyError:
        logging.error('No such backup')
        sys.exit(1)

    if cluster_backup.is_complete():
        print('{.name}'.format(cluster_backup))
    else:
        print('{.name} [Incomplete!]'.format(cluster_backup))

    started = datetime.fromtimestamp(
        cluster_backup.started).strftime(TIMESTAMP_FORMAT)
    if cluster_backup.finished is None:
        print('- Started: {}, Finished: never'.format(started))
    else:
        finished = datetime.fromtimestamp(
            cluster_backup.finished).strftime(TIMESTAMP_FORMAT)
        print('- Started: {}, Finished: {}'.format(started, finished))

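    # Summarize per-node completion status for this backup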
    complete_nodes = cluster_backup.complete_nodes()
    incomplete_nodes = cluster_backup.incomplete_nodes()
    missing_nodes = cluster_backup.missing_nodes()
    print('- {0} nodes completed, {1} nodes incomplete, {2} nodes missing'.
          format(len(complete_nodes), len(incomplete_nodes),
                 len(missing_nodes)))

    if len(incomplete_nodes) > 0:
        print('- Incomplete nodes:')
        for node_backup in incomplete_nodes:
            print('    {}'.format(node_backup.fqdn))

    if len(missing_nodes) > 0:
        print('- Missing nodes:')
        for fqdn in missing_nodes:
            print('    {}'.format(fqdn))

    print('- {} files, {}'.format(cluster_backup.num_objects(),
                                  format_bytes_str(cluster_backup.size())))
Example no. 6
def orchestrate(config,
                backup_name,
                seed_target,
                temp_dir,
                host_list,
                keep_auth,
                bypass_checks,
                verify,
                keyspaces,
                tables,
                pssh_pool_size,
                use_sstableloader=False):
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()
        if seed_target is not None:
            keep_auth = False

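        # Exactly one of seed_target or host_list must be provided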
        if seed_target is None and host_list is None:
            err_msg = 'You must provide either a seed target or a list of hosts'
            logging.error(err_msg)
            raise Exception(err_msg)

        if seed_target is not None and host_list is not None:
            err_msg = 'You must provide either a seed target or a list of hosts, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        if keep_auth:
            logging.info(
                'system_auth keyspace will be left untouched on the target nodes'
            )
        else:
            logging.info(
                'system_auth keyspace will be overwritten with the backup on target nodes'
            )

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

        restore = RestoreJob(cluster_backup, config, temp_dir, host_list,
                             seed_target, keep_auth, verify, pssh_pool_size,
                             keyspaces, tables, bypass_checks,
                             use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time

        logging.debug('Emitting metrics')

        logging.info('Restore duration: {}'.format(restore_duration.seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration.seconds)

        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')

    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)

        logging.error(
            'This error happened during the cluster restore: {}'.format(
                str(e)))
        traceback.print_exc()
        sys.exit(1)
Example no. 7
class MedusaService(medusa_pb2_grpc.MedusaServicer):
    def __init__(self, config):
        logging.info("Init service")
        self.config = config
        self.storage = Storage(config=self.config.storage)

    def Backup(self, request, context):
        logging.info("Performing backup {}".format(request.name))
        resp = medusa_pb2.BackupResponse()
        # TODO pass the staggered and mode args
        try:
            medusa.backup_node.main(self.config, request.name, None, False,
                                    "differential")
        except Exception as e:
            context.set_details("failed to create backups: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("backup failed")

        return resp

    def BackupStatus(self, request, context):
        response = medusa_pb2.BackupStatusResponse()
        try:
            backup = self.storage.get_cluster_backup(request.backupName)

            # TODO how is the startTime determined?
            response.startTime = datetime.fromtimestamp(
                backup.started).strftime(TIMESTAMP_FORMAT)
            # repeated protobuf fields must be extended, not assigned
            response.finishedNodes.extend(
                [node.fqdn for node in backup.complete_nodes()])
            response.unfinishedNodes.extend(
                [node.fqdn for node in backup.incomplete_nodes()])
            response.missingNodes.extend(
                [node.fqdn for node in backup.missing_nodes()])

            if backup.finished:
                response.finishTime = datetime.fromtimestamp(
                    backup.finished).strftime(TIMESTAMP_FORMAT)
            else:
                response.finishTime = ""

            return response
        except KeyError:
            context.set_details("backup <{}> does not exist".format(
                request.backupName))
            context.set_code(grpc.StatusCode.NOT_FOUND)
            return response

    def GetBackups(self, request, context):
        response = medusa_pb2.GetBackupsResponse()
        try:
            backups = medusa.listing.get_backups(self.config, True)
            for backup in backups:
                summary = medusa_pb2.BackupSummary()
                summary.backupName = backup.name
                if backup.started is None:
                    summary.startTime = 0
                else:
                    summary.startTime = backup.started
                if backup.finished is None:
                    summary.finishTime = 0
                else:
                    summary.finishTime = backup.finished
                summary.totalNodes = len(backup.tokenmap)
                summary.finishedNodes = len(backup.complete_nodes())
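                # Build one BackupNode per host in the tokenmap, defaulting missing dc/rack to empty strings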
                for node in backup.tokenmap:
                    tokenmapNode = medusa_pb2.BackupNode()
                    tokenmapNode.host = node
                    tokenmapNode.datacenter = backup.tokenmap[node].get("dc", "")
                    tokenmapNode.rack = backup.tokenmap[node].get("rack", "")
                    if "tokens" in backup.tokenmap[node]:
                        for token in backup.tokenmap[node]["tokens"]:
                            tokenmapNode.tokens.append(token)
                    summary.nodes.append(tokenmapNode)
                response.backups.append(summary)

            return response
        except Exception as e:
            context.set_details("failed to get backups: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            return response

    def DeleteBackup(self, request, context):
        logging.info("Deleting backup {}".format(request.name))
        resp = medusa_pb2.DeleteBackupResponse()
        try:
            medusa.purge.delete_backup(self.config, request.name, True)
        except Exception as e:
            context.set_details("deleting backups failed: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("Deleting backup {} failed".format(request.name))

        return resp
Example no. 8
def orchestrate(config, backup_name_arg, seed_target, stagger,
                enable_md5_checks, mode, temp_dir, parallel_snapshots,
                parallel_uploads):
    backup = None
    backup_name = backup_name_arg or datetime.datetime.now().strftime(
        '%Y%m%d%H%M')
    monitoring = Monitoring(config=config.monitoring)
    try:
        backup_start_time = datetime.datetime.now()
        if not config.storage.fqdn:
            err_msg = "The fqdn was not provided nor calculated properly."
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        try:
            # Try to get a backup with backup_name. If it exists then we cannot take another backup with that name
            storage = Storage(config=config.storage)
            cluster_backup = storage.get_cluster_backup(backup_name)
            if cluster_backup:
                err_msg = 'Backup named {} already exists.'.format(backup_name)
                logging.error(err_msg)
                raise Exception(err_msg)
        except KeyError:
            info_msg = 'Starting backup {}'.format(backup_name)
            logging.info(info_msg)

        backup = BackupJob(config, backup_name, seed_target, stagger,
                           enable_md5_checks, mode, temp_dir,
                           parallel_snapshots, parallel_uploads)
        backup.execute()

        backup_end_time = datetime.datetime.now()
        backup_duration = backup_end_time - backup_start_time

        logging.debug('Emitting metrics')

        logging.info('Backup duration: {}'.format(backup_duration.seconds))
        tags = [
            'medusa-cluster-backup', 'cluster-backup-duration', backup_name
        ]
        monitoring.send(tags, backup_duration.seconds)

        tags = ['medusa-cluster-backup', 'cluster-backup-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics.')
        logging.info('Backup of the cluster done.')

    except Exception as e:
        tags = ['medusa-cluster-backup', 'cluster-backup-error', backup_name]
        monitoring.send(tags, 1)

        logging.error(
            'This error happened during the cluster backup: {}'.format(str(e)))
        traceback.print_exc()

        if backup is not None:
            err_msg = 'Something went wrong! Attempting to clean snapshots and exit.'
            logging.error(err_msg)

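            # Ask every host, via pssh, to delete its snapshot so the failed backup does not leave snapshots behind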
            delete_snapshot_command = ' '.join(
                backup.cassandra.delete_snapshot_command(backup.snapshot_tag))
            pssh_run_success_cleanup = backup.orchestration_uploads\
                .pssh_run(backup.hosts,
                          delete_snapshot_command,
                          hosts_variables={})
            if pssh_run_success_cleanup:
                info_msg = 'All nodes successfully cleared their snapshot.'
                logging.info(info_msg)
            else:
                err_msg_cleanup = 'Some nodes failed to clear the snapshot. Cleaning snapshots manually is recommended'
                logging.error(err_msg_cleanup)
        sys.exit(1)
Example no. 9
class MedusaService(medusa_pb2_grpc.MedusaServicer):
    def __init__(self, config):
        logging.info("Init service")
        self.config = config
        self.storage = Storage(config=self.config.storage)

    def AsyncBackup(self, request, context):
        # TODO pass the staggered arg
        logging.info("Performing ASYNC backup {} (type={})".format(
            request.name, request.mode))
        response = medusa_pb2.BackupResponse()
        mode = BACKUP_MODE_DIFFERENTIAL
        if medusa_pb2.BackupRequest.Mode.FULL == request.mode:
            mode = BACKUP_MODE_FULL

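        # Run the backup on a single-worker executor; the done callback records the backup info when it completes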
        try:
            response.backupName = request.name
            response.status = medusa_pb2.StatusType.IN_PROGRESS
            with ThreadPoolExecutor(
                    max_workers=1,
                    thread_name_prefix=request.name) as executor:
                BackupMan.register_backup(request.name, is_async=True)
                backup_future = executor.submit(backup_node.handle_backup,
                                                config=self.config,
                                                backup_name_arg=request.name,
                                                stagger_time=None,
                                                enable_md5_checks_flag=False,
                                                mode=mode)

                backup_future.add_done_callback(record_backup_info)
                BackupMan.set_backup_future(request.name, backup_future)

        except Exception as e:

            response.status = medusa_pb2.StatusType.FAILED
            if request.name:
                BackupMan.update_backup_status(request.name,
                                               BackupMan.STATUS_FAILED)

            context.set_details("Failed to create async backup: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("Async backup failed due to error: {}".format(e))

        return response

    def Backup(self, request, context):
        # TODO pass the staggered arg
        logging.info("Performing SYNC backup {} (type={})".format(
            request.name, request.mode))
        response = medusa_pb2.BackupResponse()
        mode = BACKUP_MODE_DIFFERENTIAL
        if medusa_pb2.BackupRequest.Mode.FULL == request.mode:
            mode = BACKUP_MODE_FULL

        try:
            response.backupName = request.name
            BackupMan.register_backup(request.name, is_async=False)
            backup_node.handle_backup(config=self.config,
                                      backup_name_arg=request.name,
                                      stagger_time=None,
                                      enable_md5_checks_flag=False,
                                      mode=mode)
            record_status_in_response(response, request.name)
            return response
        except Exception as e:
            response.status = medusa_pb2.StatusType.FAILED
            if request.name:
                BackupMan.update_backup_status(request.name,
                                               BackupMan.STATUS_FAILED)

            context.set_details("Failed to create sync backups: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("Sync backup failed due to error: {}".format(e))

        return response

    def BackupStatus(self, request, context):

        response = medusa_pb2.BackupStatusResponse()
        try:
            backup = self.storage.get_cluster_backup(request.backupName)

            # TODO how is the startTime determined?
            response.startTime = datetime.fromtimestamp(
                backup.started).strftime(TIMESTAMP_FORMAT)
            response.finishedNodes.extend(
                [node.fqdn for node in backup.complete_nodes()])
            response.unfinishedNodes.extend(
                [node.fqdn for node in backup.incomplete_nodes()])
            response.missingNodes.extend(
                [node.fqdn for node in backup.missing_nodes()])

            if backup.finished:
                response.finishTime = datetime.fromtimestamp(
                    backup.finished).strftime(TIMESTAMP_FORMAT)
            else:
                response.finishTime = ""

            record_status_in_response(response, request.backupName)
        except KeyError:
            context.set_details("backup <{}> does not exist".format(
                request.backupName))
            context.set_code(grpc.StatusCode.NOT_FOUND)
            response.status = medusa_pb2.StatusType.UNKNOWN
        return response

    def GetBackups(self, request, context):
        response = medusa_pb2.GetBackupsResponse()
        last_status = medusa_pb2.StatusType.UNKNOWN
        try:
            # cluster backups
            backups = get_backups(self.config, True)
            for backup in backups:
                summary = medusa_pb2.BackupSummary()
                summary.backupName = backup.name
                if backup.started is None:
                    summary.startTime = 0
                else:
                    summary.startTime = backup.started

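                # An unfinished backup is reported as IN_PROGRESS; finished backups as SUCCESS unless an earlier one is still running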
                if backup.finished is None:
                    summary.finishTime = 0
                    summary.status = medusa_pb2.StatusType.IN_PROGRESS
                    last_status = medusa_pb2.StatusType.IN_PROGRESS
                else:
                    summary.finishTime = backup.finished
                    if last_status != medusa_pb2.StatusType.IN_PROGRESS:
                        summary.status = medusa_pb2.StatusType.SUCCESS

                summary.totalNodes = len(backup.tokenmap)
                summary.finishedNodes = len(backup.complete_nodes())

                for node in backup.tokenmap:
                    summary.nodes.append(create_token_map_node(backup, node))

                response.backups.append(summary)

        except Exception as e:
            context.set_details(
                "Failed to get backups due to error: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            response.status = medusa_pb2.StatusType.UNKNOWN
        return response

    def DeleteBackup(self, request, context):
        logging.info("Deleting backup {}".format(request.name))
        response = medusa_pb2.DeleteBackupResponse()

        try:
            delete_backup(self.config, [request.name], True)
            handle_backup_removal(request.name)
        except Exception as e:
            context.set_details("deleting backups failed: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("Deleting backup {} failed".format(request.name))
        return response