def setUp(self):
    """Reset the fixture directories and build a local-storage test configuration.

    Removes ``self.local_storage_dir`` and ``self.medusa_bucket_dir`` when they
    exist, recreates ``self.local_storage_dir``, then stores a ``MedusaConfig``
    on ``self.config`` and a ``Storage`` built from its storage section on
    ``self.storage``.
    """
    # Drop leftovers from a previous run, one directory at a time.
    for dir_attr in ('local_storage_dir', 'medusa_bucket_dir'):
        stale_dir = getattr(self, dir_attr)
        if os.path.isdir(stale_dir):
            shutil.rmtree(stale_dir)
    os.makedirs(self.local_storage_dir)

    parser = configparser.ConfigParser(interpolation=None)
    parser['storage'] = {
        'host_file_separator': ',',
        'bucket_name': 'medusa_test_bucket',
        'key_file': '',
        'storage_provider': 'local',
        'fqdn': '127.0.0.1',
        'api_key_or_username': '',
        'api_secret_or_password': '',
        'base_path': '/tmp',
        'prefix': 'pre'
    }
    parser['cassandra'] = {'is_ccm': 1}

    storage_section = _namedtuple_from_dict(StorageConfig, parser['storage'])
    cassandra_section = _namedtuple_from_dict(CassandraConfig, parser['cassandra'])
    self.config = MedusaConfig(
        storage=storage_section,
        cassandra=cassandra_section,
        monitoring={},
        ssh=None,
        checks=None,
        logging=None,
        grpc=None,
        kubernetes=None,
    )
    self.storage = Storage(config=self.config.storage)
def _i_can_download_the_backup_all_tables_successfully(context, backup_name):
    """Download ``backup_name`` with an empty table filter and check keyspace directories.

    The data is downloaded into ``/tmp/medusa-download-all-tables/``; for every
    keyspace that has at least one object in the backup manifest, a directory
    of that name must exist under the download path.
    """
    def cleanup(temp_path):
        # isdir() is already False for a path that does not exist.
        if os.path.isdir(temp_path):
            shutil.rmtree(temp_path)

    medusa_config = context.medusa_config
    storage = Storage(config=medusa_config.storage)

    download_path = os.path.join("/tmp", "medusa-download-all-tables/")
    cleanup(download_path)
    os.makedirs(download_path)

    backup = storage.get_node_backup(fqdn=medusa_config.storage.fqdn, name=backup_name)
    no_table_filter = set()
    medusa.download.download_data(medusa_config.storage, backup, no_table_filter, Path(download_path))

    # check all manifest objects that have been backed up have been downloaded
    backed_up_keyspaces = set()
    for section in json.loads(backup.manifest):
        if section['objects']:
            backed_up_keyspaces.add(section['keyspace'])

    for keyspace in backed_up_keyspaces:
        assert os.path.isdir(os.path.join(download_path, keyspace))

    cleanup(download_path)
def restore_node(config, temp_dir, backup_name, in_place, keep_auth, seeds,
                 verify, keyspaces, tables, use_sstableloader=False):
    """Restore one node from ``backup_name`` and optionally verify the result.

    Exits the process with status 1 when both ``in_place`` and ``keep_auth``
    are set. Otherwise the restore is delegated to ``restore_node_sstableloader``
    when ``use_sstableloader`` is truthy and to ``restore_node_locally`` when it
    is not. With ``verify`` set, ``verify_restore`` is run against the FQDN
    returned by ``HostnameResolver.resolve_fqdn()``.
    """
    # The two flags contradict each other (see the message below), so bail out early.
    if in_place and keep_auth:
        logging.error(
            'Cannot keep system_auth when restoring in-place. It would be overwritten'
        )
        sys.exit(1)

    storage = Storage(config=config.storage)

    restore_fn = restore_node_sstableloader if use_sstableloader else restore_node_locally
    restore_fn(config, temp_dir, backup_name, in_place, keep_auth, seeds,
               storage, keyspaces, tables)

    if not verify:
        return

    resolve_ips = medusa.config.evaluate_boolean(config.cassandra.resolve_ip_addresses)
    local_fqdn = HostnameResolver(resolve_ips).resolve_fqdn()
    verify_restore([local_fqdn], config)
def verify(config, backup_name):
    """Print a completeness and manifest-consistency report for a cluster backup.

    Raises:
        RuntimeError: when the backup cannot be found (``KeyError`` from the
            storage lookup) or when ``validate_manifest`` reports any error.
    """
    storage = Storage(config=config.storage)

    try:
        cluster_backup = storage.get_cluster_backup(backup_name)
    except KeyError:
        logging.error('No such backup')
        raise RuntimeError("Manifest validation failed")

    print('Validating {0.name} ...'.format(cluster_backup))

    complete = cluster_backup.is_complete()
    print('- Completion: OK!' if complete else '- Completion: Not complete!')
    if not complete:
        for unfinished in cluster_backup.incomplete_nodes():
            print(
                ' - [{0.fqdn}] Backup started at {0.started}, but not finished yet'
                .format(unfinished))
        for missing_fqdn in cluster_backup.missing_nodes():
            print(' - [{}] Backup missing'.format(missing_fqdn))

    # Collect the manifest problems of every node backup before reporting.
    consistency_errors = []
    for node_backup in cluster_backup.node_backups.values():
        consistency_errors.extend(validate_manifest(storage, node_backup))

    if not consistency_errors:
        print("- Manifest validated: OK!!")
        return

    print("- Manifest validation: Failed!")
    for problem in consistency_errors:
        print(problem)
    raise RuntimeError("Manifest validation failed")
def main(config, max_backup_age=0, max_backup_count=0):
    """Purge the local node's backups selected by age and by count.

    Candidates come from ``backups_to_purge_by_age`` and
    ``backups_to_purge_by_count`` and are handed to ``purge_backups``. A
    ``purge-error`` metric of 0 is sent on success; on any exception the
    traceback is printed, the metric is sent as 1, the error is logged and the
    process exits with status 1.
    """
    monitoring = Monitoring(config=config.monitoring)
    metric_tags = ['medusa-node-backup', 'purge-error', 'PURGE-ERROR']
    purge_candidates = set()

    try:
        logging.info('Starting purge')
        storage = Storage(config=config.storage)

        # Get all backups for the local node
        local_fqdn = config.storage.fqdn
        logging.info('Listing backups for {}'.format(local_fqdn))
        index_blobs = storage.list_backup_index_blobs()
        node_backups = list(
            storage.list_node_backups(fqdn=local_fqdn, backup_index_blobs=index_blobs))

        # list all backups to purge based on date conditions
        purge_candidates.update(backups_to_purge_by_age(node_backups, max_backup_age))
        # list all backups to purge based on count conditions
        purge_candidates.update(backups_to_purge_by_count(node_backups, max_backup_count))

        # purge all candidate backups
        purge_backups(storage, purge_candidates)

        logging.debug('Emitting metrics')
        monitoring.send(metric_tags, 0)
    except Exception as e:
        traceback.print_exc()
        monitoring.send(metric_tags, 1)
        logging.error('This error happened during the purge: {}'.format(
            str(e)))
        sys.exit(1)
def _i_can_download_the_backup_single_table_successfully(context, backup_name, fqtn):
    """Download one table of ``backup_name`` and check its files landed on disk.

    ``fqtn`` is split on ``'.'`` into keyspace and table. The keyspace
    directory must exist under ``/tmp/medusa-download-one-table/`` and at least
    one ``<table>-*/*.db`` file must be present inside it.
    """
    def cleanup(temp_path):
        # isdir() is already False for a path that does not exist.
        if os.path.isdir(temp_path):
            shutil.rmtree(temp_path)

    medusa_config = context.medusa_config
    storage = Storage(config=medusa_config.storage)

    download_path = os.path.join("/tmp", "medusa-download-one-table/")
    cleanup(download_path)
    os.makedirs(download_path)

    backup = storage.get_node_backup(fqdn=medusa_config.storage.fqdn, name=backup_name)

    # download_data requires fqtn with table id
    fqtns_to_download, _ = medusa.filtering.filter_fqtns([], [fqtn], backup.manifest, True)
    medusa.download.download_data(medusa_config.storage, backup, fqtns_to_download,
                                  Path(download_path))

    # check the keyspace directory has been created
    ks, table = fqtn.split('.')
    ks_path = os.path.join(download_path, ks)
    assert os.path.isdir(ks_path)

    # check tables have been downloaded
    downloaded_db_files = list(Path(ks_path).glob('{}-*/*.db'.format(table)))
    assert downloaded_db_files

    cleanup(download_path)
def delete_backup(config, backup_name, all_nodes):
    """Delete the node backups belonging to the cluster backup ``backup_name``.

    With ``all_nodes`` falsy, only node backups whose ``fqdn`` contains the
    local ``storage.config.fqdn`` are purged. A ``delete-error`` metric of 0 is
    sent on success; on any exception it is sent as 1 and the error is passed
    to ``medusa.utils.handle_exception``.
    """
    monitoring = Monitoring(config=config.monitoring)
    metric_tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']

    try:
        storage = Storage(config=config.storage)
        node_backups = storage.get_cluster_backup(backup_name).node_backups.values()

        if all_nodes:
            backups_to_purge = node_backups
        else:
            local_fqdn = storage.config.fqdn
            backups_to_purge = [
                node_backup for node_backup in node_backups
                if local_fqdn in node_backup.fqdn
            ]

        logging.info('Deleting Backup {}...'.format(backup_name))
        purge_backups(storage, backups_to_purge)

        logging.debug('Emitting metrics')
        monitoring.send(metric_tags, 0)
    except Exception as e:
        monitoring.send(metric_tags, 1)
        medusa.utils.handle_exception(
            e,
            'This error happened during the delete of backup "{}": {}'.format(
                backup_name, str(e)),
            config)
def download_data(storageconfig, backup, fqtns_to_restore, destination):
    """Download the requested tables of ``backup`` plus its metadata files.

    A ``<destination>/<keyspace>/<columnfamily>`` directory is created for
    every manifest section, whether or not it is requested. Blobs are fetched
    only for sections whose ``keyspace.columnfamily`` is in
    ``fqtns_to_restore`` and that list at least one object. The manifest,
    schema and tokenmap blobs are always downloaded into ``destination``.
    """
    storage = Storage(config=storageconfig)

    for section in json.loads(backup.manifest):
        keyspace = section['keyspace']
        table = section['columnfamily']
        fqtn = "{}.{}".format(keyspace, table)
        dst = destination / keyspace / table
        srcs = [
            '{}{}'.format(
                storage.storage_driver.get_path_prefix(backup.data_path),
                obj['path'])
            for obj in section['objects']
        ]
        dst.mkdir(parents=True)

        if fqtn not in fqtns_to_restore:
            logging.debug(
                'Download of {} was not requested, skipping'.format(fqtn))
        elif srcs:
            logging.info('Downloading backup data')
            storage.storage_driver.download_blobs(srcs, dst)
        else:
            logging.debug('There is nothing to download for {}'.format(fqtn))

    logging.info('Downloading backup metadata...')
    metadata_paths = [backup.manifest_path, backup.schema_path, backup.tokenmap_path]
    storage.storage_driver.download_blobs(
        src=['{}'.format(path) for path in metadata_paths],
        dest=destination)
def restore_node(config, temp_dir, backup_name, in_place, keep_auth, seeds,
                 verify, keyspaces, tables, use_sstableloader=False):
    """Restore one node from ``backup_name`` and optionally verify the result.

    Exits the process with status 1 when both ``in_place`` and ``keep_auth``
    are set. The restore runs through ``restore_node_sstableloader`` when
    ``use_sstableloader`` is truthy, otherwise through ``restore_node_locally``.
    With ``verify`` set, ``verify_restore`` is run against ``socket.getfqdn()``.
    """
    conflicting_flags = in_place and keep_auth
    if conflicting_flags:
        logging.error(
            'Cannot keep system_auth when restoring in-place. It would be overwritten'
        )
        sys.exit(1)

    storage = Storage(config=config.storage)

    # Both restore strategies take exactly the same arguments.
    restore_args = (config, temp_dir, backup_name, in_place, keep_auth, seeds,
                    storage, keyspaces, tables)
    if use_sstableloader:
        restore_node_sstableloader(*restore_args)
    else:
        restore_node_locally(*restore_args)

    if verify:
        local_fqdn = socket.getfqdn()
        verify_restore([local_fqdn], config)
def _i_can_see_nb_sstables_in_the_sstable_pool(context, nb_sstables, table_name, keyspace):
    """Assert that ``<fqdn>/data/<keyspace>/<table_name>`` holds ``nb_sstables`` ``-Data.db`` objects."""
    medusa_config = context.medusa_config
    storage = Storage(config=medusa_config.storage)
    pool_path = os.path.join(medusa_config.storage.fqdn, "data", keyspace, table_name)

    sstables = [
        obj for obj in storage.storage_driver.list_objects(pool_path)
        if '-Data.db' in obj.name
    ]
    found = len(sstables)
    expected = int(nb_sstables)

    # Log what was actually found so a failing step can be diagnosed.
    if found != expected:
        logging.error("{} SSTables : {}".format(found, sstables))
        logging.error("Was expecting {} SSTables".format(nb_sstables))
    assert found == expected
def _i_cannot_see_the_backup_named_backupname_when_i_list_the_backups(context, backup_name):
    """Assert that no listed cluster backup is named ``backup_name``."""
    storage = Storage(config=context.medusa_config.storage)
    same_name_backups = [
        backup for backup in storage.list_cluster_backups()
        if backup.name == backup_name
    ]
    found = bool(same_name_backups)
    assert found is False
def orchestrate(config, backup_name, seed_target, temp_dir, host_list, keep_auth, bypass_checks,
                verify, keyspaces, tables, parallel_restores, use_sstableloader=False):
    """Run a cluster-wide restore of ``backup_name`` and emit monitoring metrics.

    At most one of ``seed_target`` / ``host_list`` may be given; when neither
    is, the local host is resolved and used as the seed target. ``temp_dir``
    must be an existing directory (checked with ``is_dir()``). The remaining
    arguments are forwarded unchanged to ``RestoreJob``.

    On success the restore duration (in whole seconds) and a ``restore-error``
    metric of 0 are sent. On any failure the ``restore-error`` metric is sent
    as 1, the error is logged, the traceback is printed and the process exits
    with status 1.
    """
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()
        if seed_target is None and host_list is None:
            # if no target node is provided, nor a host list file, default to the local node as seed target
            hostname_resolver = HostnameResolver(medusa.utils.evaluate_boolean(config.cassandra.resolve_ip_addresses))
            seed_target = hostname_resolver.resolve_fqdn(socket.gethostbyname(socket.getfqdn()))
            logging.warning("Seed target was not provided, using the local hostname: {}".format(seed_target))

        if seed_target is not None and host_list is not None:
            err_msg = 'You must either provide a seed target or a list of host, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

        restore = RestoreJob(cluster_backup, config, temp_dir, host_list, seed_target, keep_auth, verify,
                             parallel_restores, keyspaces, tables, bypass_checks, use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time
        # timedelta.seconds is only the sub-day component (0..86399), so it
        # under-reports any restore that runs longer than a day. Use the full
        # duration, truncated to an int to keep the metric type unchanged.
        restore_duration_seconds = int(restore_duration.total_seconds())

        logging.debug('Emitting metrics')

        logging.info('Restore duration: {}'.format(restore_duration_seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration_seconds)

        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')

    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)

        logging.error('This error happened during the cluster restore: {}'.format(str(e)))
        traceback.print_exc()
        sys.exit(1)
def main(config, backup_name):
    """Print each host of the cluster backup ``backup_name`` followed by its tokens.

    Logs ``'No such backup'`` and exits with status 1 when the backup cannot be
    found, whether the lookup raises ``KeyError`` or returns a falsy value.
    """
    storage = Storage(config=config.storage)

    try:
        backup = storage.get_cluster_backup(backup_name)
    except KeyError:
        # The other commands in this file handle a KeyError from
        # get_cluster_backup as "backup not found"; route it to the same error
        # path instead of letting it escape as a traceback.
        backup = None

    if not backup:
        logging.error('No such backup')
        sys.exit(1)

    for hostname, ringitem in backup.tokenmap.items():
        print(hostname)
        print(ringitem['tokens'])
def cleanup_storage(context, storage_provider):
    """Empty the integration-test storage.

    For the ``"local"`` provider the ``/tmp/medusa_it_bucket`` directory is
    removed if present and recreated. For any other provider every object
    listed under ``storage._prefix`` is deleted through the storage driver.
    """
    if storage_provider != "local":
        storage = Storage(config=context.medusa_config.storage)
        for obj in storage.storage_driver.list_objects(storage._prefix):
            storage.storage_driver.delete_object(obj)
        return

    bucket_dir = os.path.join("/tmp", "medusa_it_bucket")
    if os.path.isdir(bucket_dir):
        shutil.rmtree(bucket_dir)
    os.makedirs(bucket_dir)
def _the_backup_named_backupname_has_nb_sstables_for_the_whatever_table(
        context, backup_name, nb_sstables, table_name, keyspace):
    """Assert that ``<fqdn>/<backup_name>/data/<keyspace>/<table_name>`` holds ``nb_sstables`` ``-Data.db`` objects."""
    medusa_config = context.medusa_config
    storage = Storage(config=medusa_config.storage)
    table_path = os.path.join(medusa_config.storage.fqdn, backup_name, "data",
                              keyspace, table_name)

    sstables = [
        obj for obj in storage.storage_driver.list_objects(table_path)
        if "-Data.db" in obj.name
    ]
    found = len(sstables)
    expected = int(nb_sstables)

    # Log what was actually found so a failing step can be diagnosed.
    if found != expected:
        logging.error("{} SSTables : {}".format(found, sstables))
        logging.error("Was expecting {} SSTables".format(nb_sstables))
    assert found == expected
def _i_can_see_secondary_index_files_in_backup(context, backup_name):
    """Assert the manifest of ``backup_name`` lists at least one object path containing ``'idx'``."""
    storage = Storage(config=context.medusa_config.storage)
    same_name_backups = [
        backup for backup in storage.list_node_backups()
        if backup.name == backup_name
    ]
    # Indexing fails with IndexError when no node backup carries that name.
    target_backup = same_name_backups[0]

    manifest = json.loads(target_backup.manifest)
    seen_index_files = sum(
        1
        for section in manifest
        for entry in section['objects']
        if 'idx' in entry['path']
    )
    assert seen_index_files > 0
def get_backups(config, show_all):
    """Return the cluster backups sorted by their ``started`` attribute.

    Args:
        config: Medusa configuration; ``config.storage`` builds the storage and
            ``config.storage.fqdn`` identifies the local node.
        show_all: when falsy, keep only the cluster backups whose
            ``node_backups`` contain the local node's fqdn.

    Returns:
        A list of cluster backups in both cases. The filtered branch used to
        return a one-shot ``filter`` iterator, so the result could only be
        iterated once (and had no ``len()``) when ``show_all`` was falsy.
    """
    storage = Storage(config=config.storage)
    cluster_backups = sorted(storage.list_cluster_backups(), key=lambda b: b.started)

    if show_all:
        return cluster_backups

    local_fqdn = config.storage.fqdn
    return [
        cluster_backup for cluster_backup in cluster_backups
        if local_fqdn in cluster_backup.node_backups
    ]
def _backup_named_something_has_nb_files_in_the_manifest(context, backup_name, nb_files, table_name, keyspace_name):
    """Assert the file count recorded in the manifest of ``backup_name`` for a table.

    Every manifest section whose keyspace equals ``keyspace_name`` and whose
    column family name starts with ``table_name`` must list exactly
    ``nb_files`` objects.
    """
    storage = Storage(config=context.medusa_config.storage)

    # Find the backup we're looking for
    same_name_backups = [
        backup for backup in storage.list_node_backups()
        if backup.name == backup_name
    ]
    target_backup = same_name_backups[0]

    # Parse its manifest
    for section in json.loads(target_backup.manifest):
        if section['keyspace'] != keyspace_name:
            continue
        if not section['columnfamily'].startswith(table_name):
            continue

        actual_count = len(section['objects'])
        expected_count = int(nb_files)
        if actual_count != expected_count:
            logging.error("Was expecting {} files, got {}".format(nb_files, actual_count))
        assert actual_count == expected_count
def download_cmd(config, backup_name, download_destination):
    """Download the local node's backup ``backup_name`` into ``download_destination``.

    Exits with status 1 when ``download_destination`` is not a directory or
    when the node backup does not exist.
    """
    def abort(message):
        # Log the problem and stop the command with a non-zero status.
        logging.error(message)
        sys.exit(1)

    storage = Storage(config=config.storage)

    if not download_destination.is_dir():
        abort('{} is not a directory'.format(download_destination))

    node_backup = storage.get_node_backup(fqdn=storage.config.fqdn, name=backup_name)
    if not node_backup.exists():
        abort('No such backup')

    # NOTE(review): the download_data definitions visible in this file take
    # four parameters; confirm which version this three-argument call targets.
    download_data(config.storage, node_backup, download_destination)
def _the_backup_named_backupname_is_present_in_the_index(context, backup_name):
    """Assert the index copies of the tokenmap and manifest match the backup's own copies."""
    storage = Storage(config=context.medusa_config.storage)
    fqdn = context.medusa_config.storage.fqdn

    def read_blob(*path_parts):
        # Join the parts into a blob path and return its content as a string.
        return storage.storage_driver.get_blob_content_as_string(os.path.join(*path_parts))

    tokenmap_from_index = read_blob('index/backup_index', backup_name, 'tokenmap_{}.json'.format(fqdn))
    tokenmap_from_backup = read_blob(fqdn, backup_name, 'meta', 'tokenmap.json')

    # Check that we have the manifest as well there
    manifest_from_index = read_blob('index/backup_index', backup_name, 'manifest_{}.json'.format(fqdn))
    manifest_from_backup = read_blob(fqdn, backup_name, 'meta', 'manifest.json')

    assert tokenmap_from_backup == tokenmap_from_index and manifest_from_backup == manifest_from_index
def handle_backup(config, backup_name_arg, stagger_time, enable_md5_checks_flag, mode):
    """Prepare and run a node backup, tracking its status through ``BackupMan``.

    The backup name is ``backup_name_arg`` or, when that is falsy, the start
    time formatted as ``%Y%m%d%H%M``. Differential mode is used when ``mode``
    equals ``"differential"``. A backup that already exists is an error.

    Returns:
        A tuple of the ``actual_backup_duration``, ``actual_start_time``,
        ``end_time``, ``node_backup``, ``node_backup_cache``, ``num_files``,
        ``start_time`` and ``backup_name`` entries of the mapping returned by
        ``start_backup``, in that order.

    On any exception the error is logged, the backup status is set to failed,
    a ``backup-error`` metric of 1 is sent and the exception is passed to
    ``medusa.utils.handle_exception``.
    """
    start = datetime.datetime.now()
    backup_name = backup_name_arg or start.strftime('%Y%m%d%H%M')
    monitoring = Monitoring(config=config.monitoring)

    try:
        logging.debug(
            "Starting backup preparations with Mode: {}".format(mode))
        storage = Storage(config=config.storage)
        cassandra = Cassandra(config)

        differential_mode = mode == "differential"

        node_backup = storage.get_node_backup(
            fqdn=config.storage.fqdn,
            name=backup_name,
            differential_mode=differential_mode)
        if node_backup.exists():
            raise IOError(
                'Error: Backup {} already exists'.format(backup_name))

        # Starting the backup
        logging.info(
            "Starting backup using Stagger: {} Mode: {} Name: {}".format(
                stagger_time, mode, backup_name))
        BackupMan.update_backup_status(backup_name, BackupMan.STATUS_IN_PROGRESS)
        info = start_backup(storage, node_backup, cassandra, differential_mode,
                            stagger_time, start, mode, enable_md5_checks_flag,
                            backup_name, config, monitoring)
        BackupMan.update_backup_status(backup_name, BackupMan.STATUS_SUCCESS)

        logging.debug("Done with backup, returning backup result information")
        result_fields = (
            "actual_backup_duration",
            "actual_start_time",
            "end_time",
            "node_backup",
            "node_backup_cache",
            "num_files",
            "start_time",
            "backup_name",
        )
        return tuple(info[field] for field in result_fields)

    except Exception as e:
        logging.error(
            "Issue occurred inside handle_backup Name: {} Error: {}".format(
                backup_name, str(e)))
        BackupMan.update_backup_status(backup_name, BackupMan.STATUS_FAILED)

        failure_tags = ['medusa-node-backup', 'backup-error', backup_name]
        monitoring.send(failure_tags, 1)
        medusa.utils.handle_exception(
            e, "Error occurred during backup: {}".format(str(e)), config)
def setUp(self):
    """Build a local-storage ``MedusaConfig`` and the matching ``Storage`` for the tests."""
    storage_settings = {
        'host_file_separator': ',',
        'storage_provider': 'local',
        'base_path': '/tmp',
        'bucket_name': 'purge_test'
    }
    parser = configparser.ConfigParser(interpolation=None)
    parser['storage'] = storage_settings

    storage_section = _namedtuple_from_dict(StorageConfig, parser['storage'])
    self.config = MedusaConfig(
        storage=storage_section,
        monitoring={},
        cassandra=None,
        ssh=None,
        restore=None,
    )
    self.storage = Storage(config=self.config.storage)
def download_data(storageconfig, backup, fqtns_to_restore, destination):
    """Download the requested tables of ``backup`` plus its metadata files.

    A table is requested when ``fqtns_to_restore`` is empty or contains its
    ``keyspace.columnfamily`` name. For a requested table with objects, the
    ``<destination>/<keyspace>/<columnfamily>`` directory and any hidden
    sub-folders are created and the blobs are fetched in chunks of
    ``GSUTIL_MAX_FILES_PER_CHUNK``. The manifest, schema and tokenmap blobs are
    always downloaded into ``destination``.
    """
    storage = Storage(config=storageconfig)

    for section in json.loads(backup.manifest):
        keyspace = section['keyspace']
        table = section['columnfamily']
        fqtn = "{}.{}".format(keyspace, table)
        dst = destination / keyspace / table
        srcs = [
            '{}{}'.format(
                storage.storage_driver.get_path_prefix(backup.data_path),
                obj['path'])
            for obj in section['objects']
        ]

        requested = len(fqtns_to_restore) == 0 or fqtn in fqtns_to_restore
        if not requested:
            logging.debug(
                'Download of {} was not requested, skipping'.format(fqtn))
            continue
        if not srcs:
            logging.debug('There is nothing to download for {}'.format(fqtn))
            continue

        logging.debug('Downloading %s files to %s', len(srcs), dst)
        dst.mkdir(parents=True)

        # check for hidden sub-folders in the table directory
        # (e.g. secondary indices which live in table/.table_idx)
        hidden_subfolders = set()
        for src in srcs:
            parent_name = pathlib.Path(src).parent.name
            if parent_name.startswith('.'):
                hidden_subfolders.add(dst / parent_name)

        # create the sub-folders so the downloads actually work
        for subfolder in hidden_subfolders:
            subfolder.mkdir(parents=False)

        for src_batch in divide_chunks(srcs, GSUTIL_MAX_FILES_PER_CHUNK):
            storage.storage_driver.download_blobs(src_batch, dst)

    logging.info('Downloading backup metadata...')
    metadata_paths = [backup.manifest_path, backup.schema_path, backup.tokenmap_path]
    storage.storage_driver.download_blobs(
        srcs=['{}'.format(path) for path in metadata_paths],
        dest=destination)
def download_cmd(config, backup_name, download_destination, keyspaces, tables, ignore_system_keyspaces):
    """Download the selected tables of the local node's backup ``backup_name``.

    Exits with status 1 when ``download_destination`` is not a directory or
    when the node backup does not exist. The tables to fetch are the first
    element returned by ``filter_fqtns`` for the given keyspaces, tables and
    ``ignore_system_keyspaces`` flag.
    """
    def abort(message):
        # Log the problem and stop the command with a non-zero status.
        logging.error(message)
        sys.exit(1)

    storage = Storage(config=config.storage)

    if not download_destination.is_dir():
        abort('{} is not a directory'.format(download_destination))

    node_backup = storage.get_node_backup(fqdn=storage.config.fqdn, name=backup_name)
    if not node_backup.exists():
        abort('No such backup')

    fqtns_to_download, _ = filter_fqtns(keyspaces, tables, node_backup.manifest,
                                        ignore_system_keyspaces)
    download_data(config.storage, node_backup, fqtns_to_download, download_destination)
def status(config, backup_name):
    """Print a status summary of the cluster backup ``backup_name``.

    The summary covers the name (flagged when incomplete), start and finish
    times, the complete / incomplete / missing node counts with the affected
    nodes, and the total object count and size. Exits with status 1 when the
    storage lookup raises ``KeyError``.
    """
    storage = Storage(config=config.storage)

    try:
        cluster_backup = storage.get_cluster_backup(backup_name)
    except KeyError:
        logging.error('No such backup')
        sys.exit(1)

    name_template = '{.name}' if cluster_backup.is_complete() else '{.name} [Incomplete!]'
    print(name_template.format(cluster_backup))

    started = datetime.fromtimestamp(
        cluster_backup.started).strftime(TIMESTAMP_FORMAT)
    if cluster_backup.finished is None:
        print('- Started: {}, Finished: never'.format(started))
    else:
        finished = datetime.fromtimestamp(
            cluster_backup.finished).strftime(TIMESTAMP_FORMAT)
        print('- Started: {}, Finished: {}'.format(started, finished))

    complete_nodes = cluster_backup.complete_nodes()
    incomplete_nodes = cluster_backup.incomplete_nodes()
    missing_nodes = cluster_backup.missing_nodes()

    complete_count = len(complete_nodes)
    incomplete_count = len(incomplete_nodes)
    missing_count = len(missing_nodes)
    print('- {0} nodes completed, {1} nodes incomplete, {2} nodes missing'.
          format(complete_count, incomplete_count, missing_count))

    if incomplete_count > 0:
        print('- Incomplete nodes:')
        for node_backup in incomplete_nodes:
            print(' {}'.format(node_backup.fqdn))

    if missing_count > 0:
        print('- Missing nodes:')
        for fqdn in missing_nodes:
            print(' {}'.format(fqdn))

    total_objects = cluster_backup.num_objects()
    total_size = format_bytes_str(cluster_backup.size())
    print('- {} files, {}'.format(total_objects, total_size))
def setUp(self):
    """Build a local-storage ``MedusaConfig`` for node ``node1`` and the matching ``Storage``."""
    parser = configparser.ConfigParser(interpolation=None)
    # On a fresh parser, read_dict() is what the section assignment does internally.
    parser.read_dict({
        'storage': {
            'host_file_separator': ',',
            'storage_provider': 'local',
            'base_path': '/tmp',
            'bucket_name': 'purge_test',
            'fqdn': 'node1'
        }
    })

    storage_section = _namedtuple_from_dict(StorageConfig, parser['storage'])
    self.config = MedusaConfig(
        file_path=None,
        storage=storage_section,
        monitoring={},
        cassandra=None,
        ssh=None,
        checks=None,
        logging=None,
        grpc=None,
        kubernetes=None,
    )
    self.storage = Storage(config=self.config.storage)
def _i_can_download_the_backup_single_table_successfully(context, backup_name, fqtn):
    """Download ``backup_name`` restricted by ``fqtn`` and check ``system_distributed`` exists.

    The data goes to ``/tmp/medusa-download-one-table/``, which is wiped before
    the download and again after the check.
    """
    def cleanup(temp_path):
        # isdir() is already False for a path that does not exist.
        if os.path.isdir(temp_path):
            shutil.rmtree(temp_path)

    medusa_config = context.medusa_config
    storage = Storage(config=medusa_config.storage)

    download_path = os.path.join("/tmp", "medusa-download-one-table/")
    cleanup(download_path)
    os.makedirs(download_path)

    backup = storage.get_node_backup(fqdn=medusa_config.storage.fqdn, name=backup_name)
    medusa.download.download_data(medusa_config.storage, backup, fqtn, Path(download_path))

    system_distributed_dir = os.path.join(download_path, 'system_distributed')
    assert os.path.isdir(system_distributed_dir)

    cleanup(download_path)
def _the_backup_named_backupname_is_present_in_the_index(context, backup_name):
    """Assert the index copies of the tokenmap and manifest match the backup's own copies.

    Every blob path is built under ``storage.prefix_path``.
    """
    storage = Storage(config=context.medusa_config.storage)
    fqdn = context.medusa_config.storage.fqdn

    def read_blob(path):
        # Return the content of the blob at ``path`` as a string.
        return storage.storage_driver.get_blob_content_as_string(path)

    def index_path(file_name):
        # Location of ``file_name`` inside the backup index for this backup.
        return os.path.join("{}index/backup_index".format(storage.prefix_path),
                            backup_name, file_name)

    def meta_path(file_name):
        # Location of ``file_name`` inside the node backup's own meta folder.
        return os.path.join(storage.prefix_path + fqdn, backup_name, "meta",
                            file_name)

    tokenmap_from_index = read_blob(index_path("tokenmap_{}.json".format(fqdn)))
    tokenmap_from_backup = read_blob(meta_path("tokenmap.json"))

    # Check that we have the manifest as well there
    manifest_from_index = read_blob(index_path("manifest_{}.json".format(fqdn)))
    manifest_from_backup = read_blob(meta_path("manifest.json"))

    assert (tokenmap_from_backup == tokenmap_from_index
            and manifest_from_backup == manifest_from_index)
def delete_backup(config, backup_names, all_nodes):
    """Delete the backups named in ``backup_names``.

    The backups to remove are chosen by ``backups_to_purge_by_name`` and purged
    with the configured grace period and the local fqdn. A ``delete-error``
    metric of 0 is sent on success; on any exception it is sent as 1 and the
    error is passed to ``medusa.utils.handle_exception``.
    """
    monitoring = Monitoring(config=config.monitoring)
    metric_tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']

    try:
        storage = Storage(config=config.storage)
        all_cluster_backups = storage.list_cluster_backups()
        backups_to_purge = backups_to_purge_by_name(
            storage, all_cluster_backups, backup_names, all_nodes)

        logging.info('Deleting Backup(s) {}...'.format(",".join(backup_names)))
        grace_period_days = config.storage.backup_grace_period_in_days
        purge_backups(storage, backups_to_purge, grace_period_days,
                      storage.config.fqdn)

        logging.debug('Emitting metrics')
        monitoring.send(metric_tags, 0)
    except Exception as e:
        monitoring.send(metric_tags, 1)
        medusa.utils.handle_exception(
            e,
            'This error happened during the delete of backup(s) "{}": {}'.
            format(",".join(backup_names), str(e)),
            config)
def report_latest(config, push_metrics):
    """Run the backup checks, retrying up to three times with a 15 second pause.

    Each attempt builds a fresh ``Storage`` and runs ``check_node_backup``,
    ``check_complete_cluster_backup`` and ``check_latest_cluster_backup``. When
    the last attempt also fails, the error is logged and, if ``push_metrics``
    is set, a large ``seconds-since-backup`` value is sent.
    """
    MAX_RETRIES = 3
    SLEEP_TIME = 15

    monitoring = Monitoring(config=config.monitoring)

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            logging.debug('Trying to report about existing backups ({}/{})...'.format(
                attempt,
                MAX_RETRIES
            ))
            storage = Storage(config=config.storage)
            fqdn = config.storage.fqdn
            backup_index = storage.list_backup_index_blobs()
            check_node_backup(config, storage, fqdn, push_metrics, monitoring)
            check_complete_cluster_backup(storage, push_metrics, monitoring, backup_index)
            check_latest_cluster_backup(storage, push_metrics, monitoring, backup_index)
            # Every check went through: nothing left to retry.
            return
        except Exception as e:
            if attempt < MAX_RETRIES:
                logging.debug('Report attempt {} failed, waiting {} seconds to retry'.format(
                    attempt,
                    SLEEP_TIME
                ))
                time.sleep(SLEEP_TIME)
                continue

            # Last attempt failed as well: report the error.
            logging.error('This error happened during the check: {}'.format(e), exc_info=True)
            if push_metrics:
                # Set latest known complete backup to ~ 10 years ago to attract the attention
                # of the operator on the broken monitoring.
                logging.info("Sending a big value to 'seconds-since-backup' metric to trigger alerts.")
                long_time_flag_value = 315365400
                tags = ['medusa-cluster-backup', 'seconds-since-backup', 'TRACKING-ERROR']
                monitoring.send(tags, long_time_flag_value)