def __check_rse_usage(rse, rse_id):
    """
    Internal method to check RSE usage and limits.

    :param rse: the rse name.
    :param rse_id: the rse id.

    :returns: max_being_deleted_files, needed_free_space, used, free.
    """
    max_being_deleted_files, needed_free_space, used, free = None, None, None, None

    # Get RSE limits
    limits = rse_core.get_rse_limits(rse=rse, rse_id=rse_id)
    if not limits and 'MinFreeSpace' not in limits and 'MaxBeingDeletedFiles' not in limits:
        return max_being_deleted_files, needed_free_space, used, free

    min_free_space = limits.get('MinFreeSpace')
    max_being_deleted_files = limits.get('MaxBeingDeletedFiles')

    # Check from which sources to get used and total spaces
    # Default is storage
    source_for_total_space, source_for_used_space = 'storage', 'storage'
    values = get_rse_attribute(rse_id=rse_id, key='sourceForTotalSpace')
    if values:
        source_for_total_space = values[0]
    values = get_rse_attribute(rse_id=rse_id, key='sourceForUsedSpace')
    if values:
        source_for_used_space = values[0]

    logging.debug('RSE: %(rse)s, sourceForTotalSpace: %(source_for_total_space)s, '
                  'sourceForUsedSpace: %(source_for_used_space)s' % locals())

    # Get total and used space
    usage = rse_core.get_rse_usage(rse=rse, rse_id=rse_id, source=source_for_total_space)
    if not usage:
        return max_being_deleted_files, needed_free_space, used, free
    for var in usage:
        total, used = var['total'], var['used']
        break

    if source_for_total_space != source_for_used_space:
        usage = rse_core.get_rse_usage(rse=rse, rse_id=rse_id, source=source_for_used_space)
        if not usage:
            return max_being_deleted_files, needed_free_space, None, free
        for var in usage:
            used = var['used']
            break

    free = total - used
    if min_free_space:
        needed_free_space = min_free_space - free

    return max_being_deleted_files, needed_free_space, used, free
def get_rse_usage(rse, issuer, source=None, per_account=False, vo='def', session=None):
    """
    get RSE usage information.

    :param rse: The RSE name.
    :param issuer: The issuer account.
    :param source: dictionary of attributes by which the results should be filtered
    :param vo: The VO to act on.
    :param session: The database session in use.

    :returns: List of RSE usage data.
    """
    rse_id = rse_module.get_rse_id(rse=rse, vo=vo, session=session)
    usages = rse_module.get_rse_usage(rse_id=rse_id, source=source, per_account=per_account, session=session)
    for u in usages:
        u['rse'] = rse
        if 'account_usages' in u:
            for account_usage in u['account_usages']:
                account_usage['account'] = account_usage['account'].external
    return [api_update_return_dict(u, session=session) for u in usages]
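A minimal usage sketch for the API-layer get_rse_usage above; the RSE name 'MOCK' and the 'root' issuer are placeholders, and the call assumes the function is imported from the module that defines it:

# Hypothetical caller: fetch the 'rucio' usage record for one RSE and print the used bytes.
usages = get_rse_usage(rse='MOCK', issuer='root', source='rucio', per_account=False, vo='def')
for usage in usages:
    # Each entry carries at least 'rse', 'source' and 'used'; 'total' may be absent for some sources.
    print('%s: %s bytes used out of %s' % (usage['rse'], usage['used'], usage.get('total')))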
def process_output(output, sanity_check=True, compress=True):
    """Perform post-consistency-check actions.

    DARK files are put in the quarantined-replica table so that they
    may be deleted by the Dark Reaper. LOST files are currently
    ignored.

    ``output`` should be an ``str`` with the absolute path to the file
    produced by ``consistency()``. It must maintain its naming
    convention.

    If ``sanity_check`` is ``True`` (default) and the number of entries
    in the output file is deemed excessive, the actions are aborted.

    If ``compress`` is ``True`` (default), the file is compressed with
    bzip2 after the actions are successfully performed.
    """
    logger = logging.getLogger('auditor-worker')
    dark_replicas = []
    try:
        with open(output) as f:
            for line in f:
                label, path = line.rstrip().split(',', 1)
                if label == 'DARK':
                    scope, name = guess_replica_info(path)
                    dark_replicas.append({'path': path,
                                          'scope': scope,
                                          'name': name})
                elif label == 'LOST':
                    # TODO: Declare LOST files as suspicious.
                    pass
                else:
                    raise ValueError('unexpected label')
    # Since the file is read immediately after its creation, any error
    # exposes a bug in the Auditor.
    except Exception as error:
        logger.critical('Error processing "%s"', output, exc_info=True)
        raise error

    rse = os.path.basename(output[:output.rfind('_')])
    usage = get_rse_usage(rse, source='rucio')[0]
    threshold = config.config_get('auditor', 'threshold', False, 0.2)

    # Perform a basic sanity check by comparing the number of entries
    # with the total number of files on the RSE. If the percentage is
    # significant, there is most likely an issue with the site dump.
    if sanity_check and len(dark_replicas) > threshold * usage['files']:
        raise AssertionError('number of DARK files is exceeding threshold')

    add_quarantined_replicas(rse, dark_replicas)
    logger.debug('Processed %d DARK files from "%s"', len(dark_replicas), output)

    if compress:
        destination = bz2_compress_file(output)
        logger.debug('Compressed "%s"', destination)
def get_rse_usage(rse, issuer, source=None):
    """
    get RSE usage information.

    :param rse: The RSE name.
    :param issuer: The issuer account.
    :param source: dictionary of attributes by which the results should be filtered

    :returns: List of RSE usage data.
    """
    return rse_module.get_rse_usage(rse=rse, source=source)
def test_abacus_rse(self):
    """ ABACUS (RSE): Test update of RSE usage. """
    # Get RSE usage of all sources
    self.session.query(models.UpdatedRSECounter).delete()  # pylint: disable=no-member
    self.session.query(models.RSEUsage).delete()  # pylint: disable=no-member
    self.session.commit()  # pylint: disable=no-member

    # Upload files -> RSE usage should increase
    self.files = [{'did_scope': self.scope,
                   'did_name': 'file_' + generate_uuid(),
                   'path': file_generator(size=self.file_sizes),
                   'rse': self.rse,
                   'lifetime': -1} for i in range(0, 2)]
    self.upload_client.upload(self.files)
    [os.remove(file['path']) for file in self.files]
    rse.run(once=True)
    rse_usage = get_rse_usage(rse_id=self.rse_id)[0]
    assert rse_usage['used'] == len(self.files) * self.file_sizes
    rse_usage_from_rucio = get_rse_usage(rse_id=self.rse_id, source='rucio')[0]
    assert rse_usage_from_rucio['used'] == len(self.files) * self.file_sizes
    rse_usage_from_unavailable = get_rse_usage(rse_id=self.rse_id, source='unavailable')
    assert len(rse_usage_from_unavailable) == 0

    # Delete files -> rse usage should decrease
    from rucio.daemons.reaper.reaper import REGION
    REGION.invalidate()
    cleaner.run(once=True)
    if self.vo:
        reaper.run(once=True, include_rses='vo=%s&(%s)' % (self.vo['vo'], self.rse), greedy=True)
    else:
        reaper.run(once=True, include_rses=self.rse, greedy=True)
    rse.run(once=True)
    rse_usage = get_rse_usage(rse_id=self.rse_id)[0]
    assert rse_usage['used'] == 0
    rse_usage_from_rucio = get_rse_usage(rse_id=self.rse_id, source='rucio')[0]
    assert rse_usage_from_rucio['used'] == 0
    rse_usage_from_unavailable = get_rse_usage(rse_id=self.rse_id, source='unavailable')
    assert len(rse_usage_from_unavailable) == 0
def get_rse_usage(rse, issuer, source=None, per_account=False):
    """
    get RSE usage information.

    :param rse: The RSE name.
    :param issuer: The issuer account.
    :param source: dictionary of attributes by which the results should be filtered

    :returns: List of RSE usage data.
    """
    rse_id = rse_module.get_rse_id(rse=rse)
    usages = rse_module.get_rse_usage(rse_id=rse_id, source=source, per_account=per_account)
    return [api_update_return_dict(u) for u in usages]
def group_space(site):
    """
    Sum the 'rucio' used space of the GROUPDISK endpoints of the given site contributing to primaries.
    """
    site_groupdisks = []
    group_total = 0
    try:
        site_groupdisks = parse_expression('site=%s&spacetoken=ATLASDATADISK&type=GROUPDISK' % site)
    except:
        return group_total

    for rse in site_groupdisks:
        used = get_rse_usage(rse_id=rse['id'], source='rucio')[0]['used']
        group_total += used

    return group_total
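An illustrative call of group_space() above; 'CERN-PROD' is a placeholder site name, not taken from the source:

# Hypothetical caller: summed 'rucio' used bytes over the site's GROUPDISK RSEs
# (group_space() returns 0 if the RSE expression cannot be resolved).
groupdisk_bytes = group_space('CERN-PROD')
print('GROUPDISK usage: %.1f TB' % (groupdisk_bytes / 1E12))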
def get_rse_usage(rse, issuer, source=None, per_account=False, vo='def'):
    """
    get RSE usage information.

    :param rse: The RSE name.
    :param issuer: The issuer account.
    :param source: dictionary of attributes by which the results should be filtered
    :param vo: The VO to act on.

    :returns: List of RSE usage data.
    """
    rse_id = rse_module.get_rse_id(rse=rse, vo=vo)
    usages = rse_module.get_rse_usage(rse_id=rse_id, source=source, per_account=per_account)
    for u in usages:
        if 'account_usages' in u:
            for account_usage in u['account_usages']:
                account_usage['account'] = account_usage['account'].external
    return [api_update_return_dict(u) for u in usages]
def __check_rse_usage(rse, rse_id):
    """
    Internal method to check RSE usage and limits.

    :param rse: the rse name.
    :param rse_id: the rse id.

    :returns: max_being_deleted_files, needed_free_space, used, free.
    """
    max_being_deleted_files, needed_free_space, used, free = None, None, None, None

    # Get RSE limits
    limits = rse_core.get_rse_limits(rse=rse, rse_id=rse_id)
    if not limits and 'MinFreeSpace' not in limits and 'MaxBeingDeletedFiles' not in limits:
        return max_being_deleted_files, needed_free_space, used, free

    min_free_space = limits.get('MinFreeSpace')
    max_being_deleted_files = limits.get('MaxBeingDeletedFiles')

    # Get total space available
    usage = rse_core.get_rse_usage(rse=rse, rse_id=rse_id, source='srm')
    if not usage:
        return max_being_deleted_files, needed_free_space, used, free

    for u in usage:
        total = u['total']
        break

    # Get current used space
    cnt = get_counter(rse_id=rse_id)
    if not cnt:
        return max_being_deleted_files, needed_free_space, used, free
    used = cnt['bytes']

    # Get current amount of bytes and files waiting for deletion
    # being_deleted = rse_core.get_sum_count_being_deleted(rse_id=rse_id)

    free = total - used
    needed_free_space = min_free_space - free

    return max_being_deleted_files, needed_free_space, used, free
def test_abacus_rse(self):
    """ ABACUS (RSE): Test update of RSE usage. """
    # Get RSE usage of all sources
    self.session.query(models.UpdatedRSECounter).delete()  # pylint: disable=no-member
    self.session.query(models.RSEUsage).delete()  # pylint: disable=no-member
    self.session.commit()  # pylint: disable=no-member

    # Upload files -> RSE usage should increase
    self.files = [{'did_scope': self.scope,
                   'did_name': 'file_' + generate_uuid(),
                   'path': file_generator(size=self.file_sizes),
                   'rse': self.rse,
                   'lifetime': -1} for i in range(0, 2)]
    self.upload_client.upload(self.files)
    [os.remove(file['path']) for file in self.files]
    rse.run(once=True)
    rse_usage = get_rse_usage(rse_id=self.rse_id)[0]
    assert_equal(rse_usage['used'], len(self.files) * self.file_sizes)
    rse_usage_from_rucio = get_rse_usage(rse_id=self.rse_id, source='rucio')[0]
    assert_equal(rse_usage_from_rucio['used'], len(self.files) * self.file_sizes)
    rse_usage_from_unavailable = get_rse_usage(rse_id=self.rse_id, source='unavailable')
    assert_equal(len(rse_usage_from_unavailable), 0)

    # Delete files -> rse usage should decrease
    cleaner.run(once=True)
    reaper.run(once=True, rses=[self.rse], greedy=True)
    rse.run(once=True)
    rse_usage = get_rse_usage(rse_id=self.rse_id)[0]
    assert_equal(rse_usage['used'], 0)
    rse_usage_from_rucio = get_rse_usage(rse_id=self.rse_id, source='rucio')[0]
    assert_equal(rse_usage_from_rucio['used'], 0)
    rse_usage_from_unavailable = get_rse_usage(rse_id=self.rse_id, source='unavailable')
    assert_equal(len(rse_usage_from_unavailable), 0)
def test_bb8_full_workflow(vo, root_account, jdoe_account, rse_factory, mock_scope, did_factory):
    """BB8: Test the rebalance rule method"""
    config_core.set(section='bb8', option='allowed_accounts', value='jdoe')
    tot_rses = 4
    rses = [rse_factory.make_posix_rse() for _ in range(tot_rses)]
    rse1, rse1_id = rses[0]
    rse2, rse2_id = rses[1]
    rse3, rse3_id = rses[2]
    rse4, rse4_id = rses[3]

    # Add Tags
    # RSE 1 and 2 match expression T1=true
    # RSE 3 and 4 match expression T2=true
    T1 = tag_generator()
    T2 = tag_generator()
    add_rse_attribute(rse1_id, T1, True)
    add_rse_attribute(rse2_id, T1, True)
    add_rse_attribute(rse3_id, T2, True)
    add_rse_attribute(rse4_id, T2, True)

    # Add fake weights
    add_rse_attribute(rse1_id, "fakeweight", 10)
    add_rse_attribute(rse2_id, "fakeweight", 0)
    add_rse_attribute(rse3_id, "fakeweight", 0)
    add_rse_attribute(rse4_id, "fakeweight", 0)
    add_rse_attribute(rse1_id, "freespace", 1)
    add_rse_attribute(rse2_id, "freespace", 1)
    add_rse_attribute(rse3_id, "freespace", 1)
    add_rse_attribute(rse4_id, "freespace", 1)

    # Add quota
    set_local_account_limit(jdoe_account, rse1_id, -1)
    set_local_account_limit(jdoe_account, rse2_id, -1)
    set_local_account_limit(jdoe_account, rse3_id, -1)
    set_local_account_limit(jdoe_account, rse4_id, -1)
    set_local_account_limit(root_account, rse1_id, -1)
    set_local_account_limit(root_account, rse2_id, -1)
    set_local_account_limit(root_account, rse3_id, -1)
    set_local_account_limit(root_account, rse4_id, -1)

    # Invalidate the cache because the result of parse_expression is cached
    REGION.invalidate()

    tot_datasets = 4
    # Create a list of datasets
    datasets = [did_factory.make_dataset() for _ in range(tot_datasets)]
    dsn = [dataset['name'] for dataset in datasets]

    rules = list()
    base_unit = 100000000000
    nb_files1 = 7
    nb_files2 = 5
    nb_files3 = 3
    nb_files4 = 2
    file_size = 1 * base_unit
    rule_to_rebalance = None

    # Add one secondary file
    files = create_files(1, mock_scope, rse1_id, bytes_=1)
    add_rule(dids=[{'scope': mock_scope, 'name': files[0]['name']}], account=jdoe_account, copies=1,
             rse_expression=rse1, grouping='DATASET', weight=None, lifetime=-86400, locked=False,
             subscription_id=None)[0]
    for cnt in range(3, tot_rses):
        add_replicas(rses[cnt][1], files, jdoe_account)
        add_rule(dids=[{'scope': mock_scope, 'name': files[0]['name']}], account=jdoe_account, copies=1,
                 rse_expression=rses[cnt][0], grouping='DATASET', weight=None, lifetime=-86400, locked=False,
                 subscription_id=None)[0]
    rule_cleaner(once=True)

    # Create dataset 1 of 800 GB and create a rule on RSE 1 and RSE 3
    files = create_files(nb_files1, mock_scope, rse1_id, bytes_=file_size)
    attach_dids(mock_scope, dsn[0], files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[0]}], account=jdoe_account, copies=1,
                       rse_expression=rse1, grouping='DATASET', weight=None, lifetime=None, locked=False,
                       subscription_id=None)[0]
    rules.append(rule_id)
    add_replicas(rse3_id, files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[0]}], account=jdoe_account, copies=1,
                       rse_expression=rse3, grouping='DATASET', weight=None, lifetime=None, locked=False,
                       subscription_id=None)[0]
    rules.append(rule_id)

    # Create dataset 2 of 500 GB and create a rule on RSE 1 and RSE 2
    files = create_files(nb_files2, mock_scope, rse1_id, bytes_=file_size)
    attach_dids(mock_scope, dsn[1], files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[1]}], account=jdoe_account, copies=1,
                       rse_expression=rse1, grouping='DATASET', weight=None, lifetime=None, locked=False,
                       subscription_id=None)[0]
    rules.append(rule_id)
    add_replicas(rse2_id, files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[1]}], account=jdoe_account, copies=1,
                       rse_expression=rse2, grouping='DATASET', weight=None, lifetime=None, locked=False,
                       subscription_id=None)[0]
    rules.append(rule_id)

    # Create dataset 3 of 300 GB and create a rule on RSE 1. The copy on RSE 3 is secondary
    files = create_files(nb_files3, mock_scope, rse1_id, bytes_=file_size)
    attach_dids(mock_scope, dsn[2], files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[2]}], account=jdoe_account, copies=1,
                       rse_expression=rse1, grouping='DATASET', weight=None, lifetime=None, locked=False,
                       subscription_id=None)[0]
    rule_to_rebalance = rule_id
    rules.append(rule_id)
    add_replicas(rse3_id, files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[2]}], account=jdoe_account, copies=1,
                       rse_expression=rse3, grouping='DATASET', weight=None, lifetime=-86400, locked=False,
                       subscription_id=None)[0]
    rule_cleaner(once=True)
    try:
        rule = get_rule(rule_id)
    except:
        pytest.raises(RuleNotFound, get_rule, rule_id)

    # Create dataset 4 of 200 GB and create a rule on RSE 3. The copy on RSE 2 is secondary
    files = create_files(nb_files4, mock_scope, rse3_id, bytes_=file_size)
    attach_dids(mock_scope, dsn[3], files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[3]}], account=jdoe_account, copies=1,
                       rse_expression=rse3, grouping='DATASET', weight=None, lifetime=None, locked=False,
                       subscription_id=None)[0]
    rules.append(rule_id)
    add_replicas(rse2_id, files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[3]}], account=jdoe_account, copies=1,
                       rse_expression=rse2, grouping='DATASET', weight=None, lifetime=-86400, locked=False,
                       subscription_id=None)[0]
    rule_cleaner(once=True)
    try:
        rule = get_rule(rule_id)
    except:
        pytest.raises(RuleNotFound, get_rule, rule_id)

    for dataset in dsn:
        set_status(mock_scope, dataset, open=False)

    for rse in rses:
        fill_rse_expired(rse[1])
        set_rse_usage(rse_id=rse[1], source='min_free_space', used=2 * base_unit, free=2 * base_unit, session=None)
        set_rse_usage(rse_id=rse[1], source='storage', used=15 * base_unit, free=2 * base_unit, session=None)
    set_rse_usage(rse_id=rse2_id, source='min_free_space', used=1 * base_unit, free=1 * base_unit, session=None)
    set_rse_usage(rse_id=rse2_id, source='storage', used=6 * base_unit, free=5 * base_unit, session=None)

    run_abacus(once=True, threads=1, fill_history_table=False, sleep_time=10)

    # Summary:
    # RSE 1: 1500 GB primary + 1 B secondary
    tot_space = [src for src in get_rse_usage(rse1_id) if src['source'] == 'rucio'][0]
    expired = [src for src in get_rse_usage(rse1_id) if src['source'] == 'expired'][0]
    assert tot_space['used'] == (nb_files1 + nb_files2 + nb_files3) * file_size + 1
    assert expired['used'] == 1

    # RSE 2: 500 GB primary + 200 GB secondary
    tot_space = [src for src in get_rse_usage(rse2_id) if src['source'] == 'rucio'][0]
    expired = [src for src in get_rse_usage(rse2_id) if src['source'] == 'expired'][0]
    assert tot_space['used'] == (nb_files2 + nb_files4) * file_size
    assert expired['used'] == nb_files4 * file_size

    # Total primary on T1=true: 2000 GB
    # Total secondary on T1=true: 200 GB
    # Ratio secondary / primary = 10 %
    # Ratio on RSE 1: 0 %
    # Ratio on RSE 2: 40 %

    # Now run BB8
    re_evaluator(once=True, sleep_time=30, did_limit=100)
    bb8_run(once=True, rse_expression='%s=true' % str(T1), move_subscriptions=False, use_dump=False, sleep_time=300, threads=1, dry_run=False)

    for rule_id in rules:
        rule = get_rule(rule_id)
        if rule_id != rule_to_rebalance:
            assert rule['child_rule_id'] is None
        else:
            assert rule['child_rule_id'] is not None
            assert rule['expires_at'] <= datetime.utcnow() + timedelta(seconds=1)  # timedelta needed to prevent failure due to rounding effects
            child_rule_id = rule['child_rule_id']
            child_rule = get_rule(child_rule_id)
            assert child_rule['rse_expression'] == rse2
            # For teardown, delete child rule
            update_rule(child_rule_id, {'lifetime': -86400})
    rule_cleaner(once=True)

    for dataset in dsn:
        set_metadata(mock_scope, dataset, 'lifetime', -86400)
    undertaker.run(once=True)
def rule_rebalancer(rse_expression, move_subscriptions=False, use_dump=False, sleep_time=300, once=True, dry_run=False):
    """
    Main loop to rebalance rules automatically
    """

    total_rebalance_volume = 0
    executable = 'rucio-bb8'
    hostname = socket.gethostname()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'bb8[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.DEBUG, 'rse_expression: %s', rse_expression)
    logger(logging.INFO, 'BB8 started')

    while not GRACEFUL_STOP.is_set():
        logger(logging.INFO, 'Starting new cycle')
        heart_beat = live(executable, hostname, pid, hb_thread)
        start_time = time.time()
        total_rebalance_volume = 0
        tolerance = config_core.get('bb8', 'tolerance', default=0.05)
        max_total_rebalance_volume = config_core.get('bb8', 'max_total_rebalance_volume', default=10 * 1E12)
        max_rse_rebalance_volume = config_core.get('bb8', 'max_rse_rebalance_volume', default=500 * 1E9)
        min_total = config_core.get('bb8', 'min_total', default=20 * 1E9)
        payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None)
        if rse_expression in payload_cnt:
            logger(logging.WARNING, 'One BB8 instance already running with the same RSE expression. Stopping')
            break
        else:
            # List the RSEs represented by rse_expression
            try:
                rses = [rse for rse in parse_expression(rse_expression)]
                list_rses2 = [rse['rse'] for rse in rses]
            except InvalidRSEExpression as err:
                logger(logging.ERROR, err)
                break
            # List the RSEs represented by all the RSE expressions stored in heartbeat payload
            list_rses1 = []
            for rse_exp in payload_cnt:
                if rse_exp:
                    list_rses1 = [rse['rse'] for rse in parse_expression(rse_exp)]
            for rse in list_rses2:
                if rse in list_rses1:
                    logger(logging.WARNING, 'Overlapping RSE expressions %s vs %s. Stopping', rse_exp, rse_expression)
                    break

            logger(logging.INFO, 'Will process rebalancing on %s', rse_expression)
            heart_beat = live(executable, hostname, pid, hb_thread, older_than=max(600, sleep_time), hash_executable=None, payload=rse_expression, session=None)
            total_primary = 0
            total_secondary = 0
            total_total = 0
            global_ratio = float(0)
            for rse in rses:
                logger(logging.DEBUG, 'Getting RSE usage on %s', rse['rse'])
                rse_usage = get_rse_usage(rse_id=rse['id'])
                usage_dict = {}
                for item in rse_usage:
                    # TODO Check last update
                    usage_dict[item['source']] = {'used': item['used'], 'free': item['free'], 'total': item['total']}

                try:
                    rse['primary'] = usage_dict['rucio']['used'] - usage_dict['expired']['used']
                    rse['secondary'] = usage_dict['expired']['used']
                    rse['total'] = usage_dict['storage']['total'] - usage_dict['min_free_space']['used']
                    rse['ratio'] = float(rse['primary']) / float(rse['total'])
                except KeyError as err:
                    logger(logging.ERROR, 'Missing source usage %s for RSE %s. Exiting', err, rse['rse'])
                    break
                total_primary += rse['primary']
                total_secondary += rse['secondary']
                total_total += float(rse['total'])
                rse['receive_volume'] = 0  # Already rebalanced volume in this run

            global_ratio = float(total_primary) / float(total_total)
            logger(logging.INFO, 'Global ratio: %f' % (global_ratio))

            for rse in sorted(rses, key=lambda k: k['ratio']):
                logger(logging.INFO, '%s Sec/Prim local ratio (%f) vs global %s', rse['rse'], rse['ratio'], global_ratio)
            rses_over_ratio = sorted([rse for rse in rses if rse['ratio'] > global_ratio + global_ratio * tolerance], key=lambda k: k['ratio'], reverse=True)
            rses_under_ratio = sorted([rse for rse in rses if rse['ratio'] < global_ratio - global_ratio * tolerance], key=lambda k: k['ratio'], reverse=False)

            # Excluding RSEs
            logger(logging.DEBUG, 'Excluding RSEs as destination which are too small by size:')
            for des in rses_under_ratio:
                if des['total'] < min_total:
                    logger(logging.DEBUG, 'Excluding %s', des['rse'])
                    rses_under_ratio.remove(des)
            logger(logging.DEBUG, 'Excluding RSEs as sources which are too small by size:')
            for src in rses_over_ratio:
                if src['total'] < min_total:
                    logger(logging.DEBUG, 'Excluding %s', src['rse'])
                    rses_over_ratio.remove(src)
            logger(logging.DEBUG, 'Excluding RSEs as destinations which are not available for write:')
            for des in rses_under_ratio:
                if des['availability'] & 2 == 0:
                    logger(logging.DEBUG, 'Excluding %s', des['rse'])
                    rses_under_ratio.remove(des)
            logger(logging.DEBUG, 'Excluding RSEs as sources which are not available for read:')
            for src in rses_over_ratio:
                if src['availability'] & 4 == 0:
                    logger(logging.DEBUG, 'Excluding %s', src['rse'])
                    rses_over_ratio.remove(src)

            # Gets the number of active transfers per location
            dict_locks = get_active_locks(session=None)

            # Loop over RSEs over the ratio
            for index, source_rse in enumerate(rses_over_ratio):

                # The volume that would be rebalanced, not real availability of the data:
                available_source_rebalance_volume = int((source_rse['primary'] - global_ratio * source_rse['secondary']) / (global_ratio + 1))
                if available_source_rebalance_volume > max_rse_rebalance_volume:
                    available_source_rebalance_volume = max_rse_rebalance_volume
                if available_source_rebalance_volume > max_total_rebalance_volume - total_rebalance_volume:
                    available_source_rebalance_volume = max_total_rebalance_volume - total_rebalance_volume

                # Select a target:
                for destination_rse in rses_under_ratio:
                    if available_source_rebalance_volume > 0:
                        vo_str = ' on VO {}'.format(destination_rse['vo']) if destination_rse['vo'] != 'def' else ''
                        if index == 0 and destination_rse['id'] in dict_locks:
                            replicating_volume = dict_locks[destination_rse['id']]['bytes']
                            logger(logging.DEBUG, 'Already %f TB replicating to %s%s', replicating_volume / 1E12, destination_rse['rse'], vo_str)
                            destination_rse['receive_volume'] += replicating_volume
                        if destination_rse['receive_volume'] >= max_rse_rebalance_volume:
                            continue
                        available_target_rebalance_volume = max_rse_rebalance_volume - destination_rse['receive_volume']
                        if available_target_rebalance_volume >= available_source_rebalance_volume:
                            available_target_rebalance_volume = available_source_rebalance_volume

                        logger(logging.INFO, 'Rebalance %d TB from %s(%f) to %s(%f)%s', available_target_rebalance_volume / 1E12, source_rse['rse'], source_rse['ratio'], destination_rse['rse'], destination_rse['ratio'], vo_str)
                        expr = destination_rse['rse']
                        rebalance_rse(rse_id=source_rse['id'], max_bytes=available_target_rebalance_volume, dry_run=dry_run, comment='Background rebalancing', force_expression=expr, logger=logger)

                        destination_rse['receive_volume'] += available_target_rebalance_volume
                        total_rebalance_volume += available_target_rebalance_volume
                        available_source_rebalance_volume -= available_target_rebalance_volume

        if once:
            break

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logger(logging.INFO, 'Sleeping for a while : %f seconds', sleep_time - time_diff)
            GRACEFUL_STOP.wait(sleep_time - time_diff)

    die(executable='rucio-bb8', hostname=hostname, pid=pid, thread=hb_thread)
tolerance = 0.15
max_total_rebalance_volume = 200 * 1E12
max_rse_rebalance_volume = 20 * 1E12
min_total = 50 * 1E12
total_rebalance_volume = 0

# Calculate the current ratios
rses = parse_expression("(datapolicynucleus=1|tier=1)&type=DATADISK\\bb8-enabled=0")
total_primary = 0
total_secondary = 0
total_total = 0
global_ratio = float(0)
for rse in rses:
    rse['primary'] = get_rse_usage(rse=None, rse_id=rse['id'], source='rucio')[0]['used'] - get_rse_usage(rse=None, rse_id=rse['id'], source='expired')[0]['used']
    rse['secondary'] = get_rse_usage(rse=None, rse_id=rse['id'], source='expired')[0]['used']
    rse['total'] = get_rse_usage(rse=None, rse_id=rse['id'], source='storage')[0]['total'] - get_rse_usage(rse=None, rse_id=rse['id'], source='min_free_space')[0]['used']
    rse['ratio'] = float(rse['primary']) / float(rse['total'])
    total_primary += rse['primary']
    total_secondary += rse['secondary']
    total_total += float(rse['total'])
    rse['receive_volume'] = 0  # Already rebalanced volume in this run
global_ratio = float(total_primary) / float(total_total)
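A small worked example of the per-RSE ratio computed above, using made-up numbers rather than values from the source:

# Hypothetical RSE: 1.2 PB of primary data ('rucio' used minus 'expired' used)
# over a usable total of 2.0 PB ('storage' total minus the 'min_free_space' reservation).
primary = 1.2e15
total = 2.0e15
ratio = float(primary) / float(total)  # 0.6, later compared against global_ratio +/- tolerance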
def __check_rse_usage(rse, rse_id, prepend_str):
    """
    Internal method to check RSE usage and limits.

    :param rse: the rse name.
    :param rse_id: the rse id.
    :param prepend_str: String to prepend to the log messages.

    :returns: max_being_deleted_files, needed_free_space, used, free.
    """

    result = REGION.get('rse_usage_%s' % rse_id)
    if result is NO_VALUE:
        max_being_deleted_files, needed_free_space, used, free, obsolete = None, None, None, None, None

        # Get RSE limits
        limits = get_rse_limits(rse_id=rse_id)
        if not limits and 'MinFreeSpace' not in limits and 'MaxBeingDeletedFiles' not in limits:
            result = (max_being_deleted_files, needed_free_space, used, free)
            REGION.set('rse_usage_%s' % rse_id, result)
            return result

        min_free_space = limits.get('MinFreeSpace')
        max_being_deleted_files = limits.get('MaxBeingDeletedFiles')

        # Check from which sources to get used and total spaces
        # Default is storage
        attributes = list_rse_attributes(rse_id=rse_id)
        source_for_total_space = attributes.get('sourceForTotalSpace', 'storage')
        source_for_used_space = attributes.get('sourceForUsedSpace', 'storage')
        greedy = attributes.get('greedyDeletion', False)
        logging.debug('%s RSE: %s, source_for_total_space: %s, source_for_used_space: %s',
                      prepend_str, rse, source_for_total_space, source_for_used_space)

        # First of all check if greedy mode is enabled for this RSE
        if greedy:
            result = (max_being_deleted_files, 1000000000000, used, free)
            REGION.set('rse_usage_%s' % rse_id, result)
            return result

        # Get total, used and obsolete space
        rse_usage = get_rse_usage(rse_id=rse_id)
        usage = [entry for entry in rse_usage if entry['source'] == 'obsolete']
        for var in usage:
            obsolete = var['used']
            break
        usage = [entry for entry in rse_usage if entry['source'] == source_for_total_space]

        # If no information is available about disk space, do nothing except if there are replicas with Epoch tombstone
        if not usage:
            if not obsolete:
                result = (max_being_deleted_files, needed_free_space, used, free)
                REGION.set('rse_usage_%s' % rse_id, result)
                return result
            result = (max_being_deleted_files, obsolete, used, free)
            REGION.set('rse_usage_%s' % rse_id, result)
            return result

        # Extract the total and used space
        for var in usage:
            total, used = var['total'], var['used']
            break

        if source_for_total_space != source_for_used_space:
            usage = [entry for entry in rse_usage if entry['source'] == source_for_used_space]
            if not usage:
                result = (max_being_deleted_files, needed_free_space, None, free)
                REGION.set('rse_usage_%s' % rse_id, result)
                return result
            for var in usage:
                used = var['used']
                break
        free = total - used
        if min_free_space:
            needed_free_space = min_free_space - free

        # If needed_free_space negative, nothing to delete except if some Epoch tombstoned replicas
        if needed_free_space <= 0:
            needed_free_space = 0 or obsolete

        result = (max_being_deleted_files, needed_free_space, used, free)
        REGION.set('rse_usage_%s' % rse_id, result)
        return result

    logging.debug('%s Using cached value for RSE usage on RSE %s', prepend_str, rse)
    return result
def process_output(output, sanity_check=True, compress=True):
    """Perform post-consistency-check actions.

    DARK files are put in the quarantined-replica table so that they
    may be deleted by the Dark Reaper. LOST files are reported as
    suspicious so that they may be further checked by the cloud squads.

    ``output`` should be an ``str`` with the absolute path to the file
    produced by ``consistency()``. It must maintain its naming
    convention.

    If ``sanity_check`` is ``True`` (default) and the number of entries
    in the output file is deemed excessive, the actions are aborted.

    If ``compress`` is ``True`` (default), the file is compressed with
    bzip2 after the actions are successfully performed.
    """
    logger = logging.getLogger('auditor-worker')
    dark_replicas = []
    lost_replicas = []
    try:
        with open(output) as f:
            for line in f:
                label, path = line.rstrip().split(',', 1)
                scope, name = guess_replica_info(path)
                if label == 'DARK':
                    dark_replicas.append({'path': path,
                                          'scope': InternalScope(scope),
                                          'name': name})
                elif label == 'LOST':
                    lost_replicas.append({'scope': InternalScope(scope),
                                          'name': name})
                else:
                    raise ValueError('unexpected label')
    # Since the file is read immediately after its creation, any error
    # exposes a bug in the Auditor.
    except Exception as error:
        logger.critical('Error processing "%s"', output, exc_info=True)
        raise error

    rse = os.path.basename(output[:output.rfind('_')])
    rse_id = get_rse_id(rse=rse)
    usage = get_rse_usage(rse_id=rse_id, source='rucio')[0]
    threshold = config.config_get('auditor', 'threshold', False, 0.2)

    # Perform a basic sanity check by comparing the number of entries
    # with the total number of files on the RSE. If the percentage is
    # significant, there is most likely an issue with the site dump.
    found_error = False
    if len(dark_replicas) > threshold * usage['files']:
        logger.warning('Number of DARK files is exceeding threshold: "%s"', output)
        found_error = True
    if len(lost_replicas) > threshold * usage['files']:
        logger.warning('Number of LOST files is exceeding threshold: "%s"', output)
        found_error = True
    if found_error and sanity_check:
        raise AssertionError('sanity check failed')

    # While converting LOST replicas to PFNs, entries that do not
    # correspond to a replica registered in Rucio are silently dropped.
    lost_pfns = [r['rses'][rse_id][0] for r in list_replicas(lost_replicas)
                 if rse_id in r['rses']]

    add_quarantined_replicas(rse_id=rse_id, replicas=dark_replicas)
    logger.debug('Processed %d DARK files from "%s"', len(dark_replicas), output)
    declare_bad_file_replicas(lost_pfns, reason='Reported by Auditor',
                              issuer=InternalAccount('root'),
                              status=BadFilesStatus.SUSPICIOUS)
    logger.debug('Processed %d LOST files from "%s"', len(lost_replicas), output)

    if compress:
        destination = bz2_compress_file(output)
        logger.debug('Compressed "%s"', destination)
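For reference, a minimal sketch of the input that process_output() above expects: each line is '<LABEL>,<absolute path>' with LABEL either DARK or LOST, and the RSE is taken from the file name up to its last underscore. The paths and file name below are made up, not taken from the source:

# Hypothetical dump excerpt; an output file named e.g. '/tmp/MOCK_20230101'
# would resolve to the RSE 'MOCK' via os.path.basename(output[:output.rfind('_')]).
example_lines = [
    'DARK,/storage/mock/rucio/mc16/ab/cd/file.root',
    'LOST,/storage/mock/rucio/mc16/ef/01/other.root',
]
for line in example_lines:
    label, path = line.rstrip().split(',', 1)
    print(label, path)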
from rucio.db.sqla.constants import RuleState

tolerance = 0.1
max_total_rebalance_volume = 200 * 1E12
max_rse_rebalance_volume = 20 * 1E12
total_rebalance_volume = 0

# Calculate the current ratios
rses = parse_expression('(datapolicynucleus=1|tier=1)&type=DATADISK')
total_primary = 0
total_secondary = 0
global_ratio = float(0)
for rse in rses:
    rse['primary'] = get_rse_usage(rse=None, rse_id=rse['id'], source='rucio')[0]['used']
    rse['secondary'] = get_rse_usage(rse=None, rse_id=rse['id'], source='expired')[0]['used']
    rse['ratio'] = float(rse['primary']) / float(rse['secondary'])
    total_primary += rse['primary']
    total_secondary += rse['secondary']
    rse['receive_volume'] = 0  # Already rebalanced volume in this run
global_ratio = float(total_primary) / float(total_secondary)
print 'Global ratio: %f' % (global_ratio)
for rse in sorted(rses, key=lambda k: k['ratio']):
    print ' %s (%f)' % (rse['rse'], rse['ratio'])
rses_over_ratio = sorted([rse for rse in rses if rse['ratio'] > global_ratio + global_ratio * tolerance], key=lambda k: k['ratio'], reverse=True)
rses_under_ratio = sorted([rse for rse in rses if rse['ratio'] < global_ratio - global_ratio * tolerance], key=lambda k: k['ratio'], reverse=False)

session = get_session()
def __check_rse_usage(rse: str, rse_id: str, greedy: bool = False, logger: 'Callable' = logging.log) -> 'Tuple[int, bool]':
    """
    Internal method to check RSE usage and limits.

    :param rse: The RSE name.
    :param rse_id: The RSE id.
    :param greedy: If True, needed_free_space will be set to 1TB regardless of actual rse usage.

    :returns: needed_free_space, only_delete_obsolete.
    """
    result = REGION.get('rse_usage_%s' % rse_id)
    if result is NO_VALUE:
        needed_free_space, used, free, obsolete = 0, 0, 0, 0

        # First of all check if greedy mode is enabled for this RSE or generally
        attributes = list_rse_attributes(rse_id=rse_id)
        rse_attr_greedy = attributes.get('greedyDeletion', False)
        if greedy or rse_attr_greedy:
            result = (1000000000000, False)
            REGION.set('rse_usage_%s' % rse_id, result)
            return result

        # Get RSE limits
        limits = get_rse_limits(rse_id=rse_id)
        min_free_space = limits.get('MinFreeSpace', 0)

        # Check from which sources to get used and total spaces
        # Default is storage
        source_for_total_space = attributes.get('source_for_total_space', 'storage')
        source_for_used_space = attributes.get('source_for_used_space', 'storage')

        logger(logging.DEBUG, 'RSE: %s, source_for_total_space: %s, source_for_used_space: %s',
               rse, source_for_total_space, source_for_used_space)

        # Get total, used and obsolete space
        rse_usage = get_rse_usage(rse_id=rse_id)
        usage = [entry for entry in rse_usage if entry['source'] == 'obsolete']
        for var in usage:
            obsolete = var['used']
            break
        usage = [entry for entry in rse_usage if entry['source'] == source_for_total_space]

        # If no information is available about disk space, do nothing except if there are replicas with Epoch tombstone
        if not usage:
            if not obsolete:
                result = (needed_free_space, False)
                REGION.set('rse_usage_%s' % rse_id, result)
                return result
            result = (obsolete, True)
            REGION.set('rse_usage_%s' % rse_id, result)
            return result

        # Extract the total and used space
        for var in usage:
            total, used = var['total'], var['used']
            break

        if source_for_total_space != source_for_used_space:
            usage = [entry for entry in rse_usage if entry['source'] == source_for_used_space]
            if not usage:
                result = (needed_free_space, False)
                REGION.set('rse_usage_%s' % rse_id, result)
                return result
            for var in usage:
                used = var['used']
                break
        free = total - used
        if min_free_space:
            needed_free_space = min_free_space - free

        # If needed_free_space negative, nothing to delete except if some Epoch tombstoned replicas
        if needed_free_space <= 0:
            result = (obsolete, True)
        else:
            result = (needed_free_space, False)
        REGION.set('rse_usage_%s' % rse_id, result)
        return result

    return result
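A hedged sketch of how a reaper-style caller might interpret the (needed_free_space, only_delete_obsolete) tuple returned above; the RSE name and id are placeholders and the branch bodies are left as comments:

# Hypothetical caller of the private helper defined above.
needed_free_space, only_delete_obsolete = __check_rse_usage(rse='MOCK', rse_id='<rse-uuid>', greedy=False)
if only_delete_obsolete:
    # Only replicas carrying an epoch tombstone are eligible for deletion.
    pass
elif needed_free_space > 0:
    # Select roughly needed_free_space bytes of expired replicas for deletion.
    pass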
def run_once(heartbeat_handler: "HeartbeatHandler", rse_expression: str, move_subscriptions: bool, use_dump: bool, dry_run: bool, **_kwargs) -> bool:
    must_sleep = False
    total_rebalance_volume = 0
    worker_number, total_workers, logger = heartbeat_handler.live()
    logger(logging.DEBUG, "Running BB8 on rse_expression: %s", rse_expression)
    tolerance = config_get_float("bb8", "tolerance", default=0.05)
    max_total_rebalance_volume = config_get_float("bb8", "max_total_rebalance_volume", default=10 * 1e12)
    max_rse_rebalance_volume = config_get_float("bb8", "max_rse_rebalance_volume", default=500 * 1e9)
    min_total = config_get_float("bb8", "min_total", default=20 * 1e9)
    payload_cnt = list_payload_counts(executable="rucio-bb8", older_than=600, hash_executable=None, session=None)
    if rse_expression in payload_cnt:
        logger(logging.WARNING, "One BB8 instance already running with the same RSE expression. Stopping")
        must_sleep = True
        return must_sleep
    else:
        # List the RSEs represented by rse_expression
        try:
            rses = [rse for rse in parse_expression(rse_expression)]
            list_rses2 = [rse["rse"] for rse in rses]
        except InvalidRSEExpression as err:
            logger(logging.ERROR, err)
            return must_sleep
        # List the RSEs represented by all the RSE expressions stored in heartbeat payload
        list_rses1 = []
        for rse_exp in payload_cnt:
            if rse_exp:
                list_rses1 = [rse["rse"] for rse in parse_expression(rse_exp)]
        for rse in list_rses2:
            if rse in list_rses1:
                logger(logging.WARNING, "Overlapping RSE expressions %s vs %s. Stopping", rse_exp, rse_expression)
                return must_sleep

        logger(logging.INFO, "Will process rebalancing on %s", rse_expression)
        worker_number, total_workers, logger = heartbeat_handler.live()
        total_primary = 0
        total_secondary = 0
        total_total = 0
        global_ratio = float(0)
        for rse in rses:
            logger(logging.DEBUG, "Getting RSE usage on %s", rse["rse"])
            rse_usage = get_rse_usage(rse_id=rse["id"])
            usage_dict = {}
            for item in rse_usage:
                # TODO Check last update
                usage_dict[item["source"]] = {"used": item["used"], "free": item["free"], "total": item["total"]}

            try:
                rse["primary"] = usage_dict["rucio"]["used"] - usage_dict["expired"]["used"]
                rse["secondary"] = usage_dict["expired"]["used"]
                rse["total"] = usage_dict["storage"]["total"] - usage_dict["min_free_space"]["used"]
                rse["ratio"] = float(rse["primary"]) / float(rse["total"])
            except KeyError as err:
                logger(logging.ERROR, "Missing source usage %s for RSE %s. Exiting", err, rse["rse"])
                break
            total_primary += rse["primary"]
            total_secondary += rse["secondary"]
            total_total += float(rse["total"])
            rse["receive_volume"] = 0  # Already rebalanced volume in this run

        global_ratio = float(total_primary) / float(total_total)
        logger(logging.INFO, "Global ratio: %f" % (global_ratio))
        for rse in sorted(rses, key=lambda k: k["ratio"]):
            logger(logging.INFO, "%s Sec/Prim local ratio (%f) vs global %s", rse["rse"], rse["ratio"], global_ratio)
        rses_over_ratio = sorted([rse for rse in rses if rse["ratio"] > global_ratio + global_ratio * tolerance], key=lambda k: k["ratio"], reverse=True)
        rses_under_ratio = sorted([rse for rse in rses if rse["ratio"] < global_ratio - global_ratio * tolerance], key=lambda k: k["ratio"], reverse=False)

        # Excluding RSEs
        logger(logging.DEBUG, "Excluding RSEs as destination which are too small by size:")
        for des in rses_under_ratio:
            if des["total"] < min_total:
                logger(logging.DEBUG, "Excluding %s", des["rse"])
                rses_under_ratio.remove(des)
        logger(logging.DEBUG, "Excluding RSEs as sources which are too small by size:")
        for src in rses_over_ratio:
            if src["total"] < min_total:
                logger(logging.DEBUG, "Excluding %s", src["rse"])
                rses_over_ratio.remove(src)
        logger(logging.DEBUG, "Excluding RSEs as destinations which are not available for write:")
        for des in rses_under_ratio:
            if des["availability"] & 2 == 0:
                logger(logging.DEBUG, "Excluding %s", des["rse"])
                rses_under_ratio.remove(des)
        logger(logging.DEBUG, "Excluding RSEs as sources which are not available for read:")
        for src in rses_over_ratio:
            if src["availability"] & 4 == 0:
                logger(logging.DEBUG, "Excluding %s", src["rse"])
                rses_over_ratio.remove(src)

        # Gets the number of active transfers per location
        dict_locks = get_active_locks(session=None)

        # Loop over RSEs over the ratio
        for index, source_rse in enumerate(rses_over_ratio):

            # The volume that would be rebalanced, not real availability of the data:
            available_source_rebalance_volume = int((source_rse["primary"] - global_ratio * source_rse["secondary"]) / (global_ratio + 1))
            if available_source_rebalance_volume > max_rse_rebalance_volume:
                available_source_rebalance_volume = max_rse_rebalance_volume
            if available_source_rebalance_volume > max_total_rebalance_volume - total_rebalance_volume:
                available_source_rebalance_volume = max_total_rebalance_volume - total_rebalance_volume

            # Select a target:
            for destination_rse in rses_under_ratio:
                if available_source_rebalance_volume > 0:
                    vo_str = " on VO {}".format(destination_rse["vo"]) if destination_rse["vo"] != "def" else ""
                    if index == 0 and destination_rse["id"] in dict_locks:
                        replicating_volume = dict_locks[destination_rse["id"]]["bytes"]
                        logger(logging.DEBUG, "Already %f TB replicating to %s%s", replicating_volume / 1e12, destination_rse["rse"], vo_str)
                        destination_rse["receive_volume"] += replicating_volume
                    if destination_rse["receive_volume"] >= max_rse_rebalance_volume:
                        continue
                    available_target_rebalance_volume = max_rse_rebalance_volume - destination_rse["receive_volume"]
                    if available_target_rebalance_volume >= available_source_rebalance_volume:
                        available_target_rebalance_volume = available_source_rebalance_volume

                    logger(logging.INFO, "Rebalance %d TB from %s(%f) to %s(%f)%s", available_target_rebalance_volume / 1e12, source_rse["rse"], source_rse["ratio"], destination_rse["rse"], destination_rse["ratio"], vo_str)
                    expr = destination_rse["rse"]
                    rebalance_rse(rse_id=source_rse["id"], max_bytes=available_target_rebalance_volume, dry_run=dry_run, comment="Background rebalancing", force_expression=expr, logger=logger)

                    destination_rse["receive_volume"] += available_target_rebalance_volume
                    total_rebalance_volume += available_target_rebalance_volume
                    available_source_rebalance_volume -= available_target_rebalance_volume

    must_sleep = True
    return must_sleep