def delete_backup(config, backup_name, all_nodes):
    backups_to_purge = list()
    monitoring = Monitoring(config=config.monitoring)

    try:
        storage = Storage(config=config.storage)
        cluster_backup = storage.get_cluster_backup(backup_name)
        backups_to_purge = cluster_backup.node_backups.values()
        # Unless the whole cluster is targeted, only purge the local node's backup
        if not all_nodes:
            backups_to_purge = [nb for nb in backups_to_purge if storage.config.fqdn in nb.fqdn]
        logging.info('Deleting Backup {}...'.format(backup_name))
        purge_backups(storage, backups_to_purge)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e,
            'This error happened during the delete of backup "{}": {}'.format(backup_name, str(e)),
            config)
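# Usage sketch (illustrative, not part of the module): `_example_delete` is a
# hypothetical helper; it assumes `config` was loaded with medusa's standard
# config machinery and carries `storage` and `monitoring` sections.
def _example_delete(config):
    delete_backup(config, 'backup1', all_nodes=False)  # purge only this node's copy
    delete_backup(config, 'backup1', all_nodes=True)   # purge the backup cluster-wide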
def verify(config, backup_name):
    storage = Storage(config=config.storage)

    try:
        cluster_backup = storage.get_cluster_backup(backup_name)
    except KeyError:
        logging.error('No such backup')
        raise RuntimeError("Manifest validation failed")

    print('Validating {0.name} ...'.format(cluster_backup))

    if cluster_backup.is_complete():
        print('- Completion: OK!')
    else:
        print('- Completion: Not complete!')
        for incomplete_node in cluster_backup.incomplete_nodes():
            print('  - [{0.fqdn}] Backup started at {0.started}, but has not finished yet'
                  .format(incomplete_node))
        for fqdn in cluster_backup.missing_nodes():
            print('  - [{}] Backup missing'.format(fqdn))

    # Collect manifest inconsistencies across every node backup
    consistency_errors = [
        consistency_error
        for node_backup in cluster_backup.node_backups.values()
        for consistency_error in validate_manifest(storage, node_backup)
    ]

    if consistency_errors:
        print("- Manifest validation: Failed!")
        for error in consistency_errors:
            print(error)
        raise RuntimeError("Manifest validation failed")
    else:
        print("- Manifest validation: OK!")
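# Usage sketch (illustrative): `verify` signals failure by raising RuntimeError,
# so a hypothetical CLI-style caller can map that to a non-zero exit code.
def _example_verify(config):
    try:
        verify(config, 'backup1')
    except RuntimeError:
        sys.exit(1)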
def orchestrate(config, backup_name, seed_target, temp_dir, host_list, keep_auth, bypass_checks,
                verify, keyspaces, tables, parallel_restores, use_sstableloader=False):
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()

        if seed_target is None and host_list is None:
            # If neither a target node nor a host list file is provided,
            # default to the local node as the seed target
            hostname_resolver = HostnameResolver(
                medusa.utils.evaluate_boolean(config.cassandra.resolve_ip_addresses))
            seed_target = hostname_resolver.resolve_fqdn(socket.gethostbyname(socket.getfqdn()))
            logging.warning("Seed target was not provided, using the local hostname: {}".format(seed_target))

        if seed_target is not None and host_list is not None:
            err_msg = 'You must either provide a seed target or a list of hosts, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

        restore = RestoreJob(cluster_backup, config, temp_dir, host_list, seed_target, keep_auth,
                             verify, parallel_restores, keyspaces, tables, bypass_checks,
                             use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time

        logging.debug('Emitting metrics')
        logging.info('Restore duration: {}'.format(restore_duration.seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration.seconds)
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)
        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')
    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)
        logging.error('This error happened during the cluster restore: {}'.format(str(e)))
        traceback.print_exc()
        sys.exit(1)
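# Call sketch for restoring onto the local node (a hypothetical helper; argument
# values are illustrative). With seed_target and host_list both None, the function
# above falls back to the local hostname as the seed target.
def _example_restore(config):
    import pathlib
    orchestrate(config, 'backup1', seed_target=None, temp_dir=pathlib.Path('/tmp'),
                host_list=None, keep_auth=False, bypass_checks=False, verify=False,
                keyspaces={}, tables={}, parallel_restores=1)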
def main(config, backup_name):
    storage = Storage(config=config.storage)

    # get_cluster_backup raises KeyError for an unknown backup name,
    # so catch it rather than testing the return value
    try:
        backup = storage.get_cluster_backup(backup_name)
    except KeyError:
        logging.error('No such backup')
        sys.exit(1)

    for hostname, ringitem in backup.tokenmap.items():
        print(hostname)
        print(ringitem['tokens'])
def status(config, backup_name):
    storage = Storage(config=config.storage)

    try:
        cluster_backup = storage.get_cluster_backup(backup_name)
    except KeyError:
        logging.error('No such backup')
        sys.exit(1)

    if cluster_backup.is_complete():
        print('{.name}'.format(cluster_backup))
    else:
        print('{.name} [Incomplete!]'.format(cluster_backup))

    started = datetime.fromtimestamp(cluster_backup.started).strftime(TIMESTAMP_FORMAT)
    if cluster_backup.finished is None:
        print('- Started: {}, Finished: never'.format(started))
    else:
        finished = datetime.fromtimestamp(cluster_backup.finished).strftime(TIMESTAMP_FORMAT)
        print('- Started: {}, Finished: {}'.format(started, finished))

    complete_nodes = cluster_backup.complete_nodes()
    incomplete_nodes = cluster_backup.incomplete_nodes()
    missing_nodes = cluster_backup.missing_nodes()
    print('- {0} nodes completed, {1} nodes incomplete, {2} nodes missing'
          .format(len(complete_nodes), len(incomplete_nodes), len(missing_nodes)))

    if len(incomplete_nodes) > 0:
        print('- Incomplete nodes:')
        for node_backup in incomplete_nodes:
            print('    {}'.format(node_backup.fqdn))

    if len(missing_nodes) > 0:
        print('- Missing nodes:')
        for fqdn in missing_nodes:
            print('    {}'.format(fqdn))

    print('- {} files, {}'.format(cluster_backup.num_objects(),
                                  format_bytes_str(cluster_backup.size())))
def orchestrate(config, backup_name, seed_target, temp_dir, host_list, keep_auth, bypass_checks,
                verify, keyspaces, tables, pssh_pool_size, use_sstableloader=False):
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()

        if seed_target is not None:
            keep_auth = False

        if seed_target is None and host_list is None:
            err_msg = 'You must either provide a seed target or a list of hosts'
            logging.error(err_msg)
            raise Exception(err_msg)

        if seed_target is not None and host_list is not None:
            err_msg = 'You must either provide a seed target or a list of hosts, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        if keep_auth:
            logging.info('system_auth keyspace will be left untouched on the target nodes')
        else:
            logging.info('system_auth keyspace will be overwritten with the backup on target nodes')

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

        restore = RestoreJob(cluster_backup, config, temp_dir, host_list, seed_target, keep_auth,
                             verify, pssh_pool_size, keyspaces, tables, bypass_checks,
                             use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time

        logging.debug('Emitting metrics')
        logging.info('Restore duration: {}'.format(restore_duration.seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration.seconds)
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)
        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')
    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)
        logging.error('This error happened during the cluster restore: {}'.format(str(e)))
        traceback.print_exc()
        sys.exit(1)
class MedusaService(medusa_pb2_grpc.MedusaServicer):

    def __init__(self, config):
        logging.info("Init service")
        self.config = config
        self.storage = Storage(config=self.config.storage)

    def Backup(self, request, context):
        logging.info("Performing backup {}".format(request.name))
        resp = medusa_pb2.BackupResponse()
        # TODO pass the staggered and mode args
        try:
            medusa.backup_node.main(self.config, request.name, None, False, "differential")
        except Exception as e:
            context.set_details("failed to create backups: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("backup failed")
        return resp

    def BackupStatus(self, request, context):
        response = medusa_pb2.BackupStatusResponse()
        try:
            backup = self.storage.get_cluster_backup(request.backupName)

            # TODO how is the startTime determined?
            response.startTime = datetime.fromtimestamp(backup.started).strftime(TIMESTAMP_FORMAT)
            # Repeated protobuf fields cannot be assigned directly; extend them instead
            response.finishedNodes.extend([node.fqdn for node in backup.complete_nodes()])
            response.unfinishedNodes.extend([node.fqdn for node in backup.incomplete_nodes()])
            response.missingNodes.extend([node.fqdn for node in backup.missing_nodes()])

            if backup.finished:
                response.finishTime = datetime.fromtimestamp(backup.finished).strftime(TIMESTAMP_FORMAT)
            else:
                response.finishTime = ""

            return response
        except KeyError:
            context.set_details("backup <{}> does not exist".format(request.backupName))
            context.set_code(grpc.StatusCode.NOT_FOUND)
            return response

    def GetBackups(self, request, context):
        response = medusa_pb2.GetBackupsResponse()
        try:
            backups = medusa.listing.get_backups(self.config, True)
            for backup in backups:
                summary = medusa_pb2.BackupSummary()
                summary.backupName = backup.name
                summary.startTime = 0 if backup.started is None else backup.started
                summary.finishTime = 0 if backup.finished is None else backup.finished
                summary.totalNodes = len(backup.tokenmap)
                summary.finishedNodes = len(backup.complete_nodes())
                for node in backup.tokenmap:
                    tokenmapNode = medusa_pb2.BackupNode()
                    tokenmapNode.host = node
                    tokenmapNode.datacenter = backup.tokenmap[node].get("dc", "")
                    tokenmapNode.rack = backup.tokenmap[node].get("rack", "")
                    if "tokens" in backup.tokenmap[node]:
                        for token in backup.tokenmap[node]["tokens"]:
                            tokenmapNode.tokens.append(token)
                    summary.nodes.append(tokenmapNode)
                response.backups.append(summary)
            return response
        except Exception as e:
            context.set_details("failed to get backups: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            return response

    def DeleteBackup(self, request, context):
        logging.info("Deleting backup {}".format(request.name))
        resp = medusa_pb2.DeleteBackupResponse()
        try:
            medusa.purge.delete_backup(self.config, request.name, True)
        except Exception as e:
            context.set_details("deleting backups failed: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("Deleting backup {} failed".format(request.name))
        return resp
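# Serving sketch (illustrative): the port and worker count are assumptions, and
# add_MedusaServicer_to_server follows the standard grpcio codegen naming for a
# service named Medusa.
def _example_serve(config):
    from concurrent import futures
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    medusa_pb2_grpc.add_MedusaServicer_to_server(MedusaService(config), server)
    server.add_insecure_port('[::]:50051')
    server.start()
    server.wait_for_termination()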
def orchestrate(config, backup_name_arg, seed_target, stagger, enable_md5_checks, mode, temp_dir,
                parallel_snapshots, parallel_uploads):
    backup = None
    backup_name = backup_name_arg or datetime.datetime.now().strftime('%Y%m%d%H%M')
    monitoring = Monitoring(config=config.monitoring)
    try:
        backup_start_time = datetime.datetime.now()

        if not config.storage.fqdn:
            err_msg = "The fqdn was not provided nor calculated properly."
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        try:
            # Try to get a backup with backup_name. If it exists then we cannot take
            # another backup with that name.
            storage = Storage(config=config.storage)
            cluster_backup = storage.get_cluster_backup(backup_name)
            if cluster_backup:
                err_msg = 'Backup named {} already exists.'.format(backup_name)
                logging.error(err_msg)
                raise Exception(err_msg)
        except KeyError:
            info_msg = 'Starting backup {}'.format(backup_name)
            logging.info(info_msg)

        backup = BackupJob(config, backup_name, seed_target, stagger, enable_md5_checks, mode,
                           temp_dir, parallel_snapshots, parallel_uploads)
        backup.execute()

        backup_end_time = datetime.datetime.now()
        backup_duration = backup_end_time - backup_start_time

        logging.debug('Emitting metrics')
        logging.info('Backup duration: {}'.format(backup_duration.seconds))
        tags = ['medusa-cluster-backup', 'cluster-backup-duration', backup_name]
        monitoring.send(tags, backup_duration.seconds)
        tags = ['medusa-cluster-backup', 'cluster-backup-error', backup_name]
        monitoring.send(tags, 0)
        logging.debug('Done emitting metrics.')
        logging.info('Backup of the cluster done.')
    except Exception as e:
        tags = ['medusa-cluster-backup', 'cluster-backup-error', backup_name]
        monitoring.send(tags, 1)
        logging.error('This error happened during the cluster backup: {}'.format(str(e)))
        traceback.print_exc()

        if backup is not None:
            err_msg = 'Something went wrong! Attempting to clean snapshots and exit.'
            logging.error(err_msg)
            delete_snapshot_command = ' '.join(
                backup.cassandra.delete_snapshot_command(backup.snapshot_tag))
            pssh_run_success_cleanup = backup.orchestration_uploads.pssh_run(
                backup.hosts, delete_snapshot_command, hosts_variables={})
            if pssh_run_success_cleanup:
                info_msg = 'All nodes successfully cleared their snapshot.'
                logging.info(info_msg)
            else:
                err_msg_cleanup = 'Some nodes failed to clear the snapshot. Cleaning snapshots manually is recommended'
                logging.error(err_msg_cleanup)
        sys.exit(1)
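# Invocation sketch (a hypothetical helper; values are illustrative): passing None
# as backup_name_arg makes the function derive a timestamp-based name, and temp_dir
# must point at an existing directory.
def _example_backup(config):
    import pathlib
    orchestrate(config, backup_name_arg=None, seed_target=None, stagger=None,
                enable_md5_checks=False, mode='differential',
                temp_dir=pathlib.Path('/tmp'), parallel_snapshots=1, parallel_uploads=1)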
class MedusaService(medusa_pb2_grpc.MedusaServicer):

    def __init__(self, config):
        logging.info("Init service")
        self.config = config
        self.storage = Storage(config=self.config.storage)

    def AsyncBackup(self, request, context):
        # TODO pass the staggered arg
        logging.info("Performing ASYNC backup {} (type={})".format(request.name, request.mode))
        response = medusa_pb2.BackupResponse()
        mode = BACKUP_MODE_DIFFERENTIAL
        if medusa_pb2.BackupRequest.Mode.FULL == request.mode:
            mode = BACKUP_MODE_FULL

        try:
            response.backupName = request.name
            response.status = medusa_pb2.StatusType.IN_PROGRESS
            with ThreadPoolExecutor(max_workers=1, thread_name_prefix=request.name) as executor:
                BackupMan.register_backup(request.name, is_async=True)
                backup_future = executor.submit(backup_node.handle_backup,
                                                config=self.config,
                                                backup_name_arg=request.name,
                                                stagger_time=None,
                                                enable_md5_checks_flag=False,
                                                mode=mode)
                backup_future.add_done_callback(record_backup_info)
                BackupMan.set_backup_future(request.name, backup_future)
        except Exception as e:
            response.status = medusa_pb2.StatusType.FAILED
            if request.name:
                BackupMan.update_backup_status(request.name, BackupMan.STATUS_FAILED)
            context.set_details("Failed to create async backup: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("Async backup failed due to error: {}".format(e))

        return response

    def Backup(self, request, context):
        # TODO pass the staggered arg
        logging.info("Performing SYNC backup {} (type={})".format(request.name, request.mode))
        response = medusa_pb2.BackupResponse()
        mode = BACKUP_MODE_DIFFERENTIAL
        if medusa_pb2.BackupRequest.Mode.FULL == request.mode:
            mode = BACKUP_MODE_FULL

        try:
            response.backupName = request.name
            BackupMan.register_backup(request.name, is_async=False)
            backup_node.handle_backup(config=self.config,
                                      backup_name_arg=request.name,
                                      stagger_time=None,
                                      enable_md5_checks_flag=False,
                                      mode=mode)
            record_status_in_response(response, request.name)
            return response
        except Exception as e:
            response.status = medusa_pb2.StatusType.FAILED
            if request.name:
                BackupMan.update_backup_status(request.name, BackupMan.STATUS_FAILED)
            context.set_details("Failed to create sync backups: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("Sync backup failed due to error: {}".format(e))
            return response

    def BackupStatus(self, request, context):
        response = medusa_pb2.BackupStatusResponse()
        try:
            backup = self.storage.get_cluster_backup(request.backupName)

            # TODO how is the startTime determined?
            response.startTime = datetime.fromtimestamp(backup.started).strftime(TIMESTAMP_FORMAT)
            response.finishedNodes.extend([node.fqdn for node in backup.complete_nodes()])
            response.unfinishedNodes.extend([node.fqdn for node in backup.incomplete_nodes()])
            response.missingNodes.extend([node.fqdn for node in backup.missing_nodes()])

            if backup.finished:
                response.finishTime = datetime.fromtimestamp(backup.finished).strftime(TIMESTAMP_FORMAT)
            else:
                response.finishTime = ""

            record_status_in_response(response, request.backupName)
        except KeyError:
            context.set_details("backup <{}> does not exist".format(request.backupName))
            context.set_code(grpc.StatusCode.NOT_FOUND)
            response.status = medusa_pb2.StatusType.UNKNOWN

        return response

    def GetBackups(self, request, context):
        response = medusa_pb2.GetBackupsResponse()
        last_status = medusa_pb2.StatusType.UNKNOWN
        try:
            # cluster backups
            backups = get_backups(self.config, True)
            for backup in backups:
                summary = medusa_pb2.BackupSummary()
                summary.backupName = backup.name

                if backup.started is None:
                    summary.startTime = 0
                else:
                    summary.startTime = backup.started

                if backup.finished is None:
                    summary.finishTime = 0
                    summary.status = medusa_pb2.StatusType.IN_PROGRESS
                    last_status = medusa_pb2.StatusType.IN_PROGRESS
                else:
                    summary.finishTime = backup.finished
                    if last_status != medusa_pb2.StatusType.IN_PROGRESS:
                        summary.status = medusa_pb2.StatusType.SUCCESS

                summary.totalNodes = len(backup.tokenmap)
                summary.finishedNodes = len(backup.complete_nodes())

                for node in backup.tokenmap:
                    summary.nodes.append(create_token_map_node(backup, node))

                response.backups.append(summary)
        except Exception as e:
            context.set_details("Failed to get backups due to error: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            response.status = medusa_pb2.StatusType.UNKNOWN

        return response

    def DeleteBackup(self, request, context):
        logging.info("Deleting backup {}".format(request.name))
        response = medusa_pb2.DeleteBackupResponse()
        try:
            delete_backup(self.config, [request.name], True)
            handle_backup_removal(request.name)
        except Exception as e:
            context.set_details("deleting backups failed: {}".format(e))
            context.set_code(grpc.StatusCode.INTERNAL)
            logging.exception("Deleting backup {} failed".format(request.name))
        return response
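# Client-side sketch (illustrative): MedusaStub follows the standard grpcio codegen
# naming, and the DIFFERENTIAL enum value is assumed alongside the FULL value used above.
def _example_async_backup_client():
    channel = grpc.insecure_channel('localhost:50051')
    stub = medusa_pb2_grpc.MedusaStub(channel)
    request = medusa_pb2.BackupRequest(name='backup1',
                                       mode=medusa_pb2.BackupRequest.Mode.DIFFERENTIAL)
    return stub.AsyncBackup(request)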