def main(config, max_backup_age=0, max_backup_count=0):
    """Purge backups of the local node that violate the retention policy.

    Lists all backups recorded for this node's FQDN, selects the ones that
    exceed ``max_backup_age`` (days) and/or ``max_backup_count``, purges them,
    and emits a purge-error metric: 0 on success, 1 on any failure.

    Args:
        config: Medusa configuration object (provides ``storage`` and
            ``monitoring`` sections).  # assumes medusa config shape — confirm
        max_backup_age: maximum age a backup may reach before being purged;
            0 presumably disables the age condition — verify in
            backups_to_purge_by_age.
        max_backup_count: maximum number of backups to retain; 0 presumably
            disables the count condition — verify in backups_to_purge_by_count.

    Exits the process with status 1 if the purge fails for any reason.
    """
    monitoring = Monitoring(config=config.monitoring)
    try:
        logging.info('Starting purge')
        storage = Storage(config=config.storage)
        # Get all backups for the local node
        logging.info('Listing backups for {}'.format(config.storage.fqdn))
        backup_index = storage.list_backup_index_blobs()
        backups = list(
            storage.list_node_backups(fqdn=config.storage.fqdn,
                                      backup_index_blobs=backup_index)
        )
        # Collect purge candidates from both retention conditions. A backup
        # can match both the age and the count condition, so de-duplicate
        # (preserving order) to avoid purging the same backup twice.
        backups_to_purge = list(backups_to_purge_by_age(backups, max_backup_age))
        backups_to_purge += [
            backup
            for backup in backups_to_purge_by_count(backups, max_backup_count)
            if backup not in backups_to_purge
        ]
        # purge all candidate backups
        purge_backups(storage, backups_to_purge)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'purge-error', 'PURGE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        traceback.print_exc()
        tags = ['medusa-node-backup', 'purge-error', 'PURGE-ERROR']
        monitoring.send(tags, 1)
        logging.error('This error happened during the purge: {}'.format(str(e)))
        sys.exit(1)
def report_latest(config, push_metrics):
    """Report on existing backups, retrying transient failures.

    Runs the node/cluster backup checks (which optionally push metrics when
    ``push_metrics`` is truthy), retrying up to MAX_RETRIES times with a
    SLEEP_TIME-second pause between attempts. If every attempt fails and
    metrics are enabled, a very large 'seconds-since-backup' value is sent so
    that monitoring alerts fire and the operator notices the broken tracking.

    Args:
        config: Medusa configuration object (provides ``storage`` and
            ``monitoring`` sections).  # assumes medusa config shape — confirm
        push_metrics: whether the check helpers should emit metrics.
    """
    MAX_RETRIES = 3
    SLEEP_TIME = 15
    monitoring = Monitoring(config=config.monitoring)

    for retry in range(MAX_RETRIES):
        try:
            logging.debug('Trying to report about existing backups ({}/{})...'.format(
                retry + 1,
                MAX_RETRIES
            ))
            storage = Storage(config=config.storage)
            fqdn = config.storage.fqdn
            backup_index = storage.list_backup_index_blobs()
            check_node_backup(config, storage, fqdn, push_metrics, monitoring)
            check_complete_cluster_backup(storage, push_metrics, monitoring, backup_index)
            check_latest_cluster_backup(storage, push_metrics, monitoring, backup_index)
            break
        except Exception as e:
            if (retry + 1) < MAX_RETRIES:
                # Transient failure: back off and try again.
                logging.debug('Report attempt {} failed, waiting {} seconds to retry'.format(
                    retry + 1,
                    SLEEP_TIME
                ))
                time.sleep(SLEEP_TIME)
                continue
            else:
                logging.error('This error happened during the check: {}'.format(e), exc_info=True)
                if push_metrics:
                    # Set latest known complete backup to ~ 10 years ago to attract the attention
                    # of the operator on the broken monitoring.
                    logging.info("Sending a big value to 'seconds-since-backup' metric to trigger alerts.")
                    long_time_flag_value = 315365400
                    tags = ['medusa-cluster-backup', 'seconds-since-backup', 'TRACKING-ERROR']
                    monitoring.send(tags, long_time_flag_value)