def handle_backup(config, backup_name_arg, stagger_time, enable_md5_checks_flag, mode):
    start = datetime.datetime.now()
    backup_name = backup_name_arg or start.strftime('%Y%m%d%H%M')
    monitoring = Monitoring(config=config.monitoring)

    try:
        logging.debug("Starting backup preparations with Mode: {}".format(mode))
        storage = Storage(config=config.storage)
        cassandra = Cassandra(config)

        differential_mode = False
        if mode == "differential":
            differential_mode = True

        node_backup = storage.get_node_backup(
            fqdn=config.storage.fqdn,
            name=backup_name,
            differential_mode=differential_mode)

        if node_backup.exists():
            raise IOError('Error: Backup {} already exists'.format(backup_name))

        # Starting the backup
        logging.info("Starting backup using Stagger: {} Mode: {} Name: {}".format(stagger_time, mode, backup_name))
        BackupMan.update_backup_status(backup_name, BackupMan.STATUS_IN_PROGRESS)
        info = start_backup(storage, node_backup, cassandra, differential_mode, stagger_time, start, mode,
                            enable_md5_checks_flag, backup_name, config, monitoring)
        BackupMan.update_backup_status(backup_name, BackupMan.STATUS_SUCCESS)

        logging.debug("Done with backup, returning backup result information")
        return (info["actual_backup_duration"], info["actual_start_time"], info["end_time"], info["node_backup"],
                info["node_backup_cache"], info["num_files"], info["start_time"], info["backup_name"])

    except Exception as e:
        logging.error("Issue occurred inside handle_backup Name: {} Error: {}".format(backup_name, str(e)))
        BackupMan.update_backup_status(backup_name, BackupMan.STATUS_FAILED)

        tags = ['medusa-node-backup', 'backup-error', backup_name]
        monitoring.send(tags, 1)

        medusa.utils.handle_exception(
            e,
            "Error occurred during backup: {}".format(str(e)),
            config)
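
# Illustrative sketch (not part of Medusa): the status-tracking pattern handle_backup()
# follows above, reduced to plain Python. The statuses dict, the status constants and
# the run_backup callable are stand-ins, not the real BackupMan API.
STATUS_IN_PROGRESS, STATUS_SUCCESS, STATUS_FAILED = "IN_PROGRESS", "SUCCESS", "FAILED"
_statuses = {}


def tracked_backup(name, run_backup):
    # Mark the backup as in progress before doing any work, so observers can see it started.
    _statuses[name] = STATUS_IN_PROGRESS
    try:
        result = run_backup()
        _statuses[name] = STATUS_SUCCESS
        return result
    except Exception:
        # Any failure flips the status so monitoring can alert on it, then re-raises.
        _statuses[name] = STATUS_FAILED
        raise
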
def orchestrate(config, backup_name, seed_target, temp_dir, host_list, keep_auth, bypass_checks, verify,
                keyspaces, tables, parallel_restores, use_sstableloader=False):
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()
        if seed_target is None and host_list is None:
            # if no target node is provided, nor a host list file, default to the local node as seed target
            hostname_resolver = HostnameResolver(medusa.utils.evaluate_boolean(config.cassandra.resolve_ip_addresses))
            seed_target = hostname_resolver.resolve_fqdn(socket.gethostbyname(socket.getfqdn()))
            logging.warning("Seed target was not provided, using the local hostname: {}".format(seed_target))

        if seed_target is not None and host_list is not None:
            err_msg = 'You must either provide a seed target or a list of hosts, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

        restore = RestoreJob(cluster_backup, config, temp_dir, host_list, seed_target, keep_auth, verify,
                             parallel_restores, keyspaces, tables, bypass_checks, use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time

        logging.debug('Emitting metrics')

        logging.info('Restore duration: {}'.format(restore_duration.seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration.seconds)

        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')
    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)

        logging.error('This error happened during the cluster restore: {}'.format(str(e)))
        traceback.print_exc()
        sys.exit(1)
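
# Illustrative sketch (not part of Medusa): the local-node fallback used by orchestrate()
# above, without Medusa's HostnameResolver. It resolves the local machine's FQDN to an
# address; Medusa then maps that address back to a hostname before using it as seed target.
import socket


def local_seed_candidate():
    # getfqdn() gives the local fully-qualified name; gethostbyname() resolves it to an address.
    return socket.gethostbyname(socket.getfqdn())
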
def main(config, max_backup_age=0, max_backup_count=0):
    backups_to_purge = set()
    monitoring = Monitoring(config=config.monitoring)

    try:
        logging.info('Starting purge')
        storage = Storage(config=config.storage)
        # Get all backups for the local node
        logging.info('Listing backups for {}'.format(config.storage.fqdn))
        backup_index = storage.list_backup_index_blobs()
        backups = list(storage.list_node_backups(fqdn=config.storage.fqdn, backup_index_blobs=backup_index))

        # list all backups to purge based on date conditions
        backups_to_purge |= set(backups_to_purge_by_age(backups, max_backup_age))

        # list all backups to purge based on count conditions
        backups_to_purge |= set(backups_to_purge_by_count(backups, max_backup_count))

        # purge all candidate backups
        purge_backups(storage, backups_to_purge)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'purge-error', 'PURGE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        traceback.print_exc()
        tags = ['medusa-node-backup', 'purge-error', 'PURGE-ERROR']
        monitoring.send(tags, 1)
        logging.error('This error happened during the purge: {}'.format(str(e)))
        sys.exit(1)
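
# Illustrative sketch (not part of Medusa): how age-based and count-based purge candidates
# can be combined with a set union, as main() does above. These helpers are simplified
# stand-ins for backups_to_purge_by_age/by_count; they assume each backup object exposes a
# `started` datetime attribute and that the list is ordered oldest-first.
import datetime


def purge_by_age(backups, max_age_days, now=None):
    if max_age_days <= 0:
        return set()
    cutoff = (now or datetime.datetime.now()) - datetime.timedelta(days=max_age_days)
    return {b for b in backups if b.started < cutoff}


def purge_by_count(backups, max_count):
    if max_count <= 0:
        return set()
    # Keep only the newest max_count backups; everything older is a purge candidate.
    return set(backups[:-max_count]) if len(backups) > max_count else set()


# candidates = purge_by_age(backups, max_backup_age) | purge_by_count(backups, max_backup_count)
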
def delete_backup(config, backup_name, all_nodes):
    backups_to_purge = list()
    monitoring = Monitoring(config=config.monitoring)

    try:
        storage = Storage(config=config.storage)
        cluster_backup = storage.get_cluster_backup(backup_name)
        backups_to_purge = cluster_backup.node_backups.values()

        if not all_nodes:
            backups_to_purge = [nb for nb in backups_to_purge if storage.config.fqdn in nb.fqdn]

        logging.info('Deleting Backup {}...'.format(backup_name))
        purge_backups(storage, backups_to_purge)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e,
            'This error happened during the delete of backup "{}": {}'.format(backup_name, str(e)),
            config)
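
# Illustrative sketch (not part of Medusa): the node filtering applied above when
# all_nodes is False, with a plain namedtuple standing in for Medusa's node backup objects.
from collections import namedtuple

FakeNodeBackup = namedtuple('FakeNodeBackup', ['name', 'fqdn'])


def select_local_backups(node_backups, local_fqdn):
    # Keep only the backups whose fqdn contains the local node's fqdn, as delete_backup() does.
    return [nb for nb in node_backups if local_fqdn in nb.fqdn]
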
def report_latest(config, push_metrics):
    MAX_RETRIES = 3
    SLEEP_TIME = 15
    retry = 0

    monitoring = Monitoring(config=config.monitoring)

    for retry in range(MAX_RETRIES):
        try:
            logging.debug('Trying to report about existing backups ({}/{})...'.format(
                retry + 1,
                MAX_RETRIES
            ))
            storage = Storage(config=config.storage)
            fqdn = config.storage.fqdn
            backup_index = storage.list_backup_index_blobs()
            check_node_backup(config, storage, fqdn, push_metrics, monitoring)
            check_complete_cluster_backup(storage, push_metrics, monitoring, backup_index)
            check_latest_cluster_backup(storage, push_metrics, monitoring, backup_index)
            break
        except Exception as e:
            if (retry + 1) < MAX_RETRIES:
                logging.debug('Report attempt {} failed, waiting {} seconds to retry'.format(
                    retry + 1,
                    SLEEP_TIME
                ))
                time.sleep(SLEEP_TIME)
                continue
            else:
                logging.error('This error happened during the check: {}'.format(e), exc_info=True)
                if push_metrics:
                    # Set latest known complete backup to ~ 10 years ago to attract the attention
                    # of the operator on the broken monitoring.
                    logging.info("Sending a big value to 'seconds-since-backup' metric to trigger alerts.")
                    long_time_flag_value = 315365400
                    tags = ['medusa-cluster-backup', 'seconds-since-backup', 'TRACKING-ERROR']
                    monitoring.send(tags, long_time_flag_value)
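
# Illustrative sketch (not part of Medusa): the bounded retry loop used by report_latest()
# above, reduced to a generic helper. The check and alert callables are stand-ins; the
# 315365400-second sentinel (~10 years) mirrors the fallback metric value sent above.
import time


def retry_report(check, max_retries=3, sleep_time=15, alert=None):
    for attempt in range(max_retries):
        try:
            check()
            return True
        except Exception:
            if attempt + 1 < max_retries:
                # Wait before the next attempt, just like report_latest() does.
                time.sleep(sleep_time)
            elif alert is not None:
                # Out of retries: emit a value large enough to trip any freshness alert.
                alert(315365400)
    return False
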
def delete_backup(config, backup_names, all_nodes):
    monitoring = Monitoring(config=config.monitoring)
    try:
        storage = Storage(config=config.storage)
        cluster_backups = storage.list_cluster_backups()
        backups_to_purge = backups_to_purge_by_name(storage, cluster_backups, backup_names, all_nodes)

        logging.info('Deleting Backup(s) {}...'.format(",".join(backup_names)))
        purge_backups(storage, backups_to_purge, config.storage.backup_grace_period_in_days, storage.config.fqdn)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e,
            'This error happened during the delete of backup(s) "{}": {}'.format(",".join(backup_names), str(e)),
            config)
def orchestrate(config, backup_name, seed_target, temp_dir, host_list, keep_auth, bypass_checks, verify,
                keyspaces, tables, pssh_pool_size, use_sstableloader=False):
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()
        if seed_target is not None:
            keep_auth = False

        if seed_target is None and host_list is None:
            err_msg = 'You must either provide a seed target or a list of hosts'
            logging.error(err_msg)
            raise Exception(err_msg)

        if seed_target is not None and host_list is not None:
            err_msg = 'You must either provide a seed target or a list of hosts, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        if keep_auth:
            logging.info('system_auth keyspace will be left untouched on the target nodes')
        else:
            logging.info('system_auth keyspace will be overwritten with the backup on target nodes')

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

        restore = RestoreJob(cluster_backup, config, temp_dir, host_list, seed_target, keep_auth, verify,
                             pssh_pool_size, keyspaces, tables, bypass_checks, use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time

        logging.debug('Emitting metrics')

        logging.info('Restore duration: {}'.format(restore_duration.seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration.seconds)

        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')
    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)

        logging.error('This error happened during the cluster restore: {}'.format(str(e)))
        traceback.print_exc()
        sys.exit(1)
def main(config, backup_name_arg, stagger_time, mode):
    start = datetime.datetime.now()
    backup_name = backup_name_arg or start.strftime('%Y%m%d%H')
    monitoring = Monitoring(config=config.monitoring)
    try:
        storage = Storage(config=config.storage)
        cassandra = Cassandra(config)

        differential_mode = False
        if mode == "differential":
            differential_mode = True

        node_backup = storage.get_node_backup(
            fqdn=config.storage.fqdn,
            name=backup_name,
            differential_mode=differential_mode
        )

        if node_backup.exists():
            raise IOError('Error: Backup {} already exists'.format(backup_name))

        # Make sure that priority remains to Cassandra/limiting backups resource usage
        try:
            throttle_backup()
        except Exception:
            logging.warning("Throttling backup impossible. It's probable that ionice is not available.")

        logging.info('Saving tokenmap and schema')
        schema, tokenmap = get_schema_and_tokenmap(cassandra)

        node_backup.schema = schema
        node_backup.tokenmap = json.dumps(tokenmap)
        if differential_mode is True:
            node_backup.differential = mode
        add_backup_start_to_index(storage, node_backup)

        if stagger_time:
            stagger_end = start + stagger_time
            logging.info('Staggering backup run, trying until {}'.format(stagger_end))
            while not stagger(config.storage.fqdn, storage, tokenmap):
                if datetime.datetime.now() < stagger_end:
                    logging.info('Staggering this backup run...')
                    time.sleep(60)
                else:
                    raise IOError('Backups on previous nodes did not complete'
                                  ' within our stagger time.')

        actual_start = datetime.datetime.now()

        num_files, node_backup_cache = do_backup(
            cassandra, node_backup, storage, differential_mode, config, backup_name)

        end = datetime.datetime.now()
        actual_backup_duration = end - actual_start

        print_backup_stats(actual_backup_duration, actual_start, end, node_backup, node_backup_cache, num_files, start)

        update_monitoring(actual_backup_duration, backup_name, monitoring, node_backup)
        return (actual_backup_duration, actual_start, end, node_backup, node_backup_cache, num_files, start)

    except Exception as e:
        tags = ['medusa-node-backup', 'backup-error', backup_name]
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e,
            "This error happened during the backup: {}".format(str(e)),
            config
        )
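
# Illustrative sketch (not part of Medusa): the stagger wait loop used by main() above,
# generalized. The ready callable stands in for the stagger() check against other nodes'
# backups; the deadline handling and the IOError on timeout mirror the code above.
import datetime
import time


def wait_for_turn(ready, stagger_time, poll_seconds=60):
    deadline = datetime.datetime.now() + stagger_time
    while not ready():
        if datetime.datetime.now() >= deadline:
            raise IOError('Backups on previous nodes did not complete within our stagger time.')
        # Not our turn yet: wait and check again.
        time.sleep(poll_seconds)
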
def orchestrate(config, backup_name_arg, seed_target, stagger, enable_md5_checks,
                mode, temp_dir, parallel_snapshots, parallel_uploads):
    backup = None
    backup_name = backup_name_arg or datetime.datetime.now().strftime('%Y%m%d%H%M')
    monitoring = Monitoring(config=config.monitoring)
    try:
        backup_start_time = datetime.datetime.now()
        if not config.storage.fqdn:
            err_msg = "The fqdn was not provided nor calculated properly."
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        try:
            # Try to get a backup with backup_name. If it exists then we cannot take another backup with that name
            storage = Storage(config=config.storage)
            cluster_backup = storage.get_cluster_backup(backup_name)
            if cluster_backup:
                err_msg = 'Backup named {} already exists.'.format(backup_name)
                logging.error(err_msg)
                raise Exception(err_msg)
        except KeyError:
            info_msg = 'Starting backup {}'.format(backup_name)
            logging.info(info_msg)

        backup = BackupJob(config, backup_name, seed_target, stagger, enable_md5_checks, mode, temp_dir,
                           parallel_snapshots, parallel_uploads)
        backup.execute()

        backup_end_time = datetime.datetime.now()
        backup_duration = backup_end_time - backup_start_time

        logging.debug('Emitting metrics')

        logging.info('Backup duration: {}'.format(backup_duration.seconds))
        tags = ['medusa-cluster-backup', 'cluster-backup-duration', backup_name]
        monitoring.send(tags, backup_duration.seconds)

        tags = ['medusa-cluster-backup', 'cluster-backup-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics.')
        logging.info('Backup of the cluster done.')

    except Exception as e:
        tags = ['medusa-cluster-backup', 'cluster-backup-error', backup_name]
        monitoring.send(tags, 1)

        logging.error('This error happened during the cluster backup: {}'.format(str(e)))
        traceback.print_exc()

        if backup is not None:
            err_msg = 'Something went wrong! Attempting to clean snapshots and exit.'
            logging.error(err_msg)

            delete_snapshot_command = ' '.join(backup.cassandra.delete_snapshot_command(backup.snapshot_tag))
            pssh_run_success_cleanup = backup.orchestration_uploads\
                .pssh_run(backup.hosts, delete_snapshot_command, hosts_variables={})
            if pssh_run_success_cleanup:
                info_msg = 'All nodes successfully cleared their snapshot.'
                logging.info(info_msg)
            else:
                err_msg_cleanup = 'Some nodes failed to clear the snapshot. Cleaning snapshots manually is recommended'
                logging.error(err_msg_cleanup)
        sys.exit(1)
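
# Illustrative sketch (not part of Medusa): the "name must not exist yet" check performed
# above, where a KeyError from storage is the expected signal that the backup name is free.
# The dict-backed storage here is a stand-in for Medusa's Storage class.
class DictStorage:
    def __init__(self, backups=None):
        self._backups = backups or {}

    def get_cluster_backup(self, name):
        return self._backups[name]  # raises KeyError when the name is unused


def ensure_name_is_free(storage, backup_name):
    try:
        storage.get_cluster_backup(backup_name)
    except KeyError:
        return  # expected path: no backup with that name yet
    raise Exception('Backup named {} already exists.'.format(backup_name))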