(global_ratio + 1)) if available_source_rebalance_volume > max_rse_rebalance_volume: available_source_rebalance_volume = max_rse_rebalance_volume if available_source_rebalance_volume > max_total_rebalance_volume - total_rebalance_volume: available_source_rebalance_volume = max_total_rebalance_volume - total_rebalance_volume # Select a target: for destination_rse in rses_under_ratio: if available_source_rebalance_volume > 0: if destination_rse['receive_volume'] >= max_rse_rebalance_volume: continue available_target_rebalance_volume = max_rse_rebalance_volume - destination_rse[ 'receive_volume'] if available_target_rebalance_volume >= available_source_rebalance_volume: available_target_rebalance_volume = available_source_rebalance_volume print 'Rebalance %dTB from %s(%f) to %s(%f)' % ( available_target_rebalance_volume / 1E12, source_rse['rse'], source_rse['ratio'], destination_rse['rse'], destination_rse['ratio']) rebalance_rse(source_rse['rse'], max_bytes=available_target_rebalance_volume, dry_run=False, comment='Nuclei Background rebalancing', force_expression=destination_rse['rse']) destination_rse[ 'receive_volume'] += available_target_rebalance_volume total_rebalance_volume += available_target_rebalance_volume available_source_rebalance_volume -= available_target_rebalance_volume
def rule_rebalancer(rse_expression, move_subscriptions=False, use_dump=False, sleep_time=300, once=True, dry_run=False):
    """
    Main loop to rebalance rules automatically.

    Every cycle refreshes the heartbeat, reads the ``bb8`` configuration
    section, makes sure no other BB8 heartbeat payload covers the same RSEs,
    computes the primary/total usage ratio of every RSE matched by
    ``rse_expression`` and finally calls ``rebalance_rse`` to move volume from
    the RSEs above the global ratio (plus tolerance) to those below it.

    The loop ends when ``GRACEFUL_STOP`` is set, when ``once`` is true, or as
    soon as one of the "Stopping"/"Exiting" conditions is logged.

    :param rse_expression: RSE expression selecting the RSEs to balance.
    :param move_subscriptions: Not used by this function.
    :param use_dump: Not used by this function.
    :param sleep_time: Minimum duration of one cycle in seconds; the remainder
                       of the interval is spent waiting on ``GRACEFUL_STOP``.
    :param once: If True, leave the loop after the first cycle.
    :param dry_run: Forwarded unchanged to ``rebalance_rse``.
    """
    executable = 'rucio-bb8'
    hostname = socket.gethostname()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'bb8[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.DEBUG, 'rse_expression: %s', rse_expression)
    logger(logging.INFO, 'BB8 started')

    def _exclude(candidates, is_excluded):
        """Return the candidates that are not excluded, logging each dropped RSE."""
        kept = []
        for candidate in candidates:
            if is_excluded(candidate):
                logger(logging.DEBUG, 'Excluding %s', candidate['rse'])
            else:
                kept.append(candidate)
        return kept

    while not GRACEFUL_STOP.is_set():
        logger(logging.INFO, 'Starting new cycle')
        live(executable, hostname, pid, hb_thread)
        start_time = time.time()
        total_rebalance_volume = 0
        tolerance = config_core.get('bb8', 'tolerance', default=0.05)
        max_total_rebalance_volume = config_core.get('bb8', 'max_total_rebalance_volume', default=10 * 1E12)
        max_rse_rebalance_volume = config_core.get('bb8', 'max_rse_rebalance_volume', default=500 * 1E9)
        min_total = config_core.get('bb8', 'min_total', default=20 * 1E9)
        payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None)
        if rse_expression in payload_cnt:
            logger(logging.WARNING, 'One BB8 instance already running with the same RSE expression. Stopping')
            break

        # List the RSEs represented by rse_expression
        try:
            rses = list(parse_expression(rse_expression))
        except InvalidRSEExpression as err:
            logger(logging.ERROR, err)
            break
        list_rses2 = [rse['rse'] for rse in rses]

        # Compare against the RSEs represented by every RSE expression stored
        # in the heartbeat payloads. A plain ``break`` inside the nested loops
        # would only leave the innermost one, hence the explicit flag.
        overlapping = False
        for rse_exp in payload_cnt:
            if not rse_exp:
                continue
            list_rses1 = {rse['rse'] for rse in parse_expression(rse_exp)}
            if any(rse in list_rses1 for rse in list_rses2):
                logger(logging.WARNING, 'Overlapping RSE expressions %s vs %s. Stopping', rse_exp, rse_expression)
                overlapping = True
                break
        if overlapping:
            break

        logger(logging.INFO, 'Will process rebalancing on %s', rse_expression)
        live(executable, hostname, pid, hb_thread, older_than=max(600, sleep_time),
             hash_executable=None, payload=rse_expression, session=None)

        total_primary = 0
        total_total = 0
        usage_complete = True
        for rse in rses:
            logger(logging.DEBUG, 'Getting RSE usage on %s', rse['rse'])
            rse_usage = get_rse_usage(rse_id=rse['id'])
            usage_dict = {}
            for item in rse_usage:
                # TODO Check last update
                usage_dict[item['source']] = {'used': item['used'], 'free': item['free'], 'total': item['total']}
            try:
                rse['primary'] = usage_dict['rucio']['used'] - usage_dict['expired']['used']
                rse['secondary'] = usage_dict['expired']['used']
                rse['total'] = usage_dict['storage']['total'] - usage_dict['min_free_space']['used']
                rse['ratio'] = float(rse['primary']) / float(rse['total'])
            except KeyError as err:
                logger(logging.ERROR, 'Missing source usage %s for RSE %s. Exiting', err, rse['rse'])
                usage_complete = False
                break
            total_primary += rse['primary']
            total_total += float(rse['total'])
            rse['receive_volume'] = 0  # Already rebalanced volume in this run
        if not usage_complete:
            # Without complete usage the ratios below cannot be computed
            # (some RSEs have no 'ratio' key), so really exit as logged.
            break

        global_ratio = float(total_primary) / float(total_total)
        logger(logging.INFO, 'Global ratio: %f', global_ratio)
        for rse in sorted(rses, key=lambda k: k['ratio']):
            logger(logging.INFO, '%s Sec/Prim local ratio (%f) vs global %s', rse['rse'], rse['ratio'], global_ratio)

        upper_bound = global_ratio + global_ratio * tolerance
        lower_bound = global_ratio - global_ratio * tolerance
        rses_over_ratio = sorted([rse for rse in rses if rse['ratio'] > upper_bound],
                                 key=lambda k: k['ratio'], reverse=True)
        rses_under_ratio = sorted([rse for rse in rses if rse['ratio'] < lower_bound],
                                  key=lambda k: k['ratio'])

        # Excluding RSEs. New lists are built instead of removing entries from
        # the list being iterated, which would skip the element following
        # every removed one.
        logger(logging.DEBUG, 'Excluding RSEs as destination which are too small by size:')
        rses_under_ratio = _exclude(rses_under_ratio, lambda des: des['total'] < min_total)
        logger(logging.DEBUG, 'Excluding RSEs as sources which are too small by size:')
        rses_over_ratio = _exclude(rses_over_ratio, lambda src: src['total'] < min_total)
        logger(logging.DEBUG, 'Excluding RSEs as destinations which are not available for write:')
        rses_under_ratio = _exclude(rses_under_ratio, lambda des: des['availability'] & 2 == 0)
        logger(logging.DEBUG, 'Excluding RSEs as sources which are not available for read:')
        rses_over_ratio = _exclude(rses_over_ratio, lambda src: src['availability'] & 4 == 0)

        # Gets the number of active transfers per location
        dict_locks = get_active_locks(session=None)

        # Count the volume already replicating to each destination exactly
        # once, before any source is processed, so that every destination is
        # covered no matter how far the first source gets through the list.
        for destination_rse in rses_under_ratio:
            if destination_rse['id'] in dict_locks:
                replicating_volume = dict_locks[destination_rse['id']]['bytes']
                vo_str = ' on VO {}'.format(destination_rse['vo']) if destination_rse['vo'] != 'def' else ''
                logger(logging.DEBUG, 'Already %f TB replicating to %s%s',
                       replicating_volume / 1E12, destination_rse['rse'], vo_str)
                destination_rse['receive_volume'] += replicating_volume

        # Loop over RSEs over the ratio
        for source_rse in rses_over_ratio:
            # The volume that would be rebalanced, not real availability of the data,
            # capped by the per-RSE limit and by what is left of the total budget.
            available_source_rebalance_volume = min(
                int((source_rse['primary'] - global_ratio * source_rse['secondary']) / (global_ratio + 1)),
                max_rse_rebalance_volume,
                max_total_rebalance_volume - total_rebalance_volume)

            # Select a target:
            for destination_rse in rses_under_ratio:
                if available_source_rebalance_volume <= 0:
                    break
                if destination_rse['receive_volume'] >= max_rse_rebalance_volume:
                    continue
                available_target_rebalance_volume = min(
                    available_source_rebalance_volume,
                    max_rse_rebalance_volume - destination_rse['receive_volume'])
                vo_str = ' on VO {}'.format(destination_rse['vo']) if destination_rse['vo'] != 'def' else ''
                logger(logging.INFO, 'Rebalance %d TB from %s(%f) to %s(%f)%s',
                       available_target_rebalance_volume / 1E12,
                       source_rse['rse'], source_rse['ratio'],
                       destination_rse['rse'], destination_rse['ratio'], vo_str)
                rebalance_rse(rse_id=source_rse['id'],
                              max_bytes=available_target_rebalance_volume,
                              dry_run=dry_run,
                              comment='Background rebalancing',
                              force_expression=destination_rse['rse'],
                              logger=logger)
                destination_rse['receive_volume'] += available_target_rebalance_volume
                total_rebalance_volume += available_target_rebalance_volume
                available_source_rebalance_volume -= available_target_rebalance_volume

        if once:
            break

        time_diff = time.time() - start_time
        if time_diff < sleep_time:
            logger(logging.INFO, 'Sleeping for a while : %f seconds', sleep_time - time_diff)
            GRACEFUL_STOP.wait(sleep_time - time_diff)

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
if available_source_rebalance_volume > max_rse_rebalance_volume: available_source_rebalance_volume = max_rse_rebalance_volume if available_source_rebalance_volume > max_total_rebalance_volume - total_rebalance_volume: available_source_rebalance_volume = max_total_rebalance_volume - total_rebalance_volume # Select a target: for destination_rse in rses_under_ratio: if available_source_rebalance_volume > 0: if destination_rse['receive_volume'] >= max_rse_rebalance_volume: continue available_target_rebalance_volume = max_rse_rebalance_volume - destination_rse[ 'receive_volume'] if available_target_rebalance_volume >= available_source_rebalance_volume: available_target_rebalance_volume = available_source_rebalance_volume print('Rebalance %dTB from %s(%f) to %s(%f)' % (available_target_rebalance_volume / 1E12, source_rse['rse'], source_rse['ratio'], destination_rse['rse'], destination_rse['ratio'])) expr = destination_rse['rse'] rebalance_rse(rse_id=source_rse['id'], max_bytes=available_target_rebalance_volume, dry_run=False, comment='T2 Background rebalancing', force_expression=expr) destination_rse[ 'receive_volume'] += available_target_rebalance_volume total_rebalance_volume += available_target_rebalance_volume available_source_rebalance_volume -= available_target_rebalance_volume
def run_once(heartbeat_handler: "HeartbeatHandler", rse_expression: str, move_subscriptions: bool, use_dump: bool, dry_run: bool, **_kwargs) -> bool:
    """
    Run a single BB8 rebalancing pass over the RSEs matching ``rse_expression``.

    The pass reads the ``bb8`` configuration section, makes sure no other BB8
    heartbeat payload covers the same RSEs, computes the primary/total usage
    ratio of every RSE and then calls ``rebalance_rse`` to move volume from the
    RSEs above the global ratio (plus tolerance) to those below it.

    :param heartbeat_handler: Handler whose ``live()`` call yields the logger.
    :param rse_expression: RSE expression selecting the RSEs to balance.
    :param move_subscriptions: Not used by this function.
    :param use_dump: Not used by this function.
    :param dry_run: Forwarded unchanged to ``rebalance_rse``.
    :returns: True after a completed pass or when another instance already
              holds the same RSE expression; False when the pass is abandoned
              because of an invalid expression, overlapping expressions or
              missing usage information.
    """
    must_sleep = False
    total_rebalance_volume = 0
    _, _, logger = heartbeat_handler.live()
    logger(logging.DEBUG, "Running BB8 on rse_expression: %s", rse_expression)

    tolerance = config_get_float("bb8", "tolerance", default=0.05)
    max_total_rebalance_volume = config_get_float("bb8", "max_total_rebalance_volume", default=10 * 1e12)
    max_rse_rebalance_volume = config_get_float("bb8", "max_rse_rebalance_volume", default=500 * 1e9)
    min_total = config_get_float("bb8", "min_total", default=20 * 1e9)

    payload_cnt = list_payload_counts(executable="rucio-bb8", older_than=600, hash_executable=None, session=None)
    if rse_expression in payload_cnt:
        logger(
            logging.WARNING,
            "One BB8 instance already running with the same RSE expression. Stopping",
        )
        must_sleep = True
        return must_sleep

    # List the RSEs represented by rse_expression
    try:
        rses = list(parse_expression(rse_expression))
    except InvalidRSEExpression as err:
        logger(logging.ERROR, err)
        return must_sleep
    list_rses2 = [rse["rse"] for rse in rses]

    # Compare against the RSEs represented by every RSE expression stored in
    # the heartbeat payloads.
    for rse_exp in payload_cnt:
        if not rse_exp:
            continue
        list_rses1 = {rse["rse"] for rse in parse_expression(rse_exp)}
        if any(rse in list_rses1 for rse in list_rses2):
            logger(
                logging.WARNING,
                "Overlapping RSE expressions %s vs %s. Stopping",
                rse_exp,
                rse_expression,
            )
            return must_sleep

    logger(logging.INFO, "Will process rebalancing on %s", rse_expression)
    _, _, logger = heartbeat_handler.live()

    def _exclude(candidates, is_excluded):
        """Return the candidates that are not excluded, logging each dropped RSE."""
        kept = []
        for candidate in candidates:
            if is_excluded(candidate):
                logger(logging.DEBUG, "Excluding %s", candidate["rse"])
            else:
                kept.append(candidate)
        return kept

    total_primary = 0
    total_total = 0
    for rse in rses:
        logger(logging.DEBUG, "Getting RSE usage on %s", rse["rse"])
        rse_usage = get_rse_usage(rse_id=rse["id"])
        usage_dict = {}
        for item in rse_usage:
            # TODO Check last update
            usage_dict[item["source"]] = {
                "used": item["used"],
                "free": item["free"],
                "total": item["total"],
            }
        try:
            rse["primary"] = usage_dict["rucio"]["used"] - usage_dict["expired"]["used"]
            rse["secondary"] = usage_dict["expired"]["used"]
            rse["total"] = usage_dict["storage"]["total"] - usage_dict["min_free_space"]["used"]
            rse["ratio"] = float(rse["primary"]) / float(rse["total"])
        except KeyError as err:
            logger(
                logging.ERROR,
                "Missing source usage %s for RSE %s. Exiting",
                err,
                rse["rse"],
            )
            # Without complete usage the ratios below cannot be computed (some
            # RSEs have no "ratio" key), so really exit as logged.
            # NOTE(review): returns False like the other error paths above;
            # confirm that is what the caller expects for this case.
            return must_sleep
        total_primary += rse["primary"]
        total_total += float(rse["total"])
        rse["receive_volume"] = 0  # Already rebalanced volume in this run

    global_ratio = float(total_primary) / float(total_total)
    logger(logging.INFO, "Global ratio: %f", global_ratio)
    for rse in sorted(rses, key=lambda k: k["ratio"]):
        logger(
            logging.INFO,
            "%s Sec/Prim local ratio (%f) vs global %s",
            rse["rse"],
            rse["ratio"],
            global_ratio,
        )

    upper_bound = global_ratio + global_ratio * tolerance
    lower_bound = global_ratio - global_ratio * tolerance
    rses_over_ratio = sorted(
        [rse for rse in rses if rse["ratio"] > upper_bound],
        key=lambda k: k["ratio"],
        reverse=True,
    )
    rses_under_ratio = sorted(
        [rse for rse in rses if rse["ratio"] < lower_bound],
        key=lambda k: k["ratio"],
    )

    # Excluding RSEs. New lists are built instead of removing entries from the
    # list being iterated, which would skip the element following every
    # removed one.
    logger(logging.DEBUG, "Excluding RSEs as destination which are too small by size:")
    rses_under_ratio = _exclude(rses_under_ratio, lambda des: des["total"] < min_total)
    logger(logging.DEBUG, "Excluding RSEs as sources which are too small by size:")
    rses_over_ratio = _exclude(rses_over_ratio, lambda src: src["total"] < min_total)
    logger(
        logging.DEBUG,
        "Excluding RSEs as destinations which are not available for write:",
    )
    rses_under_ratio = _exclude(rses_under_ratio, lambda des: des["availability"] & 2 == 0)
    logger(logging.DEBUG, "Excluding RSEs as sources which are not available for read:")
    rses_over_ratio = _exclude(rses_over_ratio, lambda src: src["availability"] & 4 == 0)

    # Gets the number of active transfers per location
    dict_locks = get_active_locks(session=None)

    # Count the volume already replicating to each destination exactly once,
    # before any source is processed, so that every destination is covered no
    # matter how far the first source gets through the list.
    for destination_rse in rses_under_ratio:
        if destination_rse["id"] in dict_locks:
            replicating_volume = dict_locks[destination_rse["id"]]["bytes"]
            vo_str = " on VO {}".format(destination_rse["vo"]) if destination_rse["vo"] != "def" else ""
            logger(
                logging.DEBUG,
                "Already %f TB replicating to %s%s",
                replicating_volume / 1e12,
                destination_rse["rse"],
                vo_str,
            )
            destination_rse["receive_volume"] += replicating_volume

    # Loop over RSEs over the ratio
    for source_rse in rses_over_ratio:
        # The volume that would be rebalanced, not real availability of the data,
        # capped by the per-RSE limit and by what is left of the total budget.
        available_source_rebalance_volume = min(
            int((source_rse["primary"] - global_ratio * source_rse["secondary"]) / (global_ratio + 1)),
            max_rse_rebalance_volume,
            max_total_rebalance_volume - total_rebalance_volume,
        )

        # Select a target:
        for destination_rse in rses_under_ratio:
            if available_source_rebalance_volume <= 0:
                break
            if destination_rse["receive_volume"] >= max_rse_rebalance_volume:
                continue
            available_target_rebalance_volume = min(
                available_source_rebalance_volume,
                max_rse_rebalance_volume - destination_rse["receive_volume"],
            )
            vo_str = " on VO {}".format(destination_rse["vo"]) if destination_rse["vo"] != "def" else ""
            logger(
                logging.INFO,
                "Rebalance %d TB from %s(%f) to %s(%f)%s",
                available_target_rebalance_volume / 1e12,
                source_rse["rse"],
                source_rse["ratio"],
                destination_rse["rse"],
                destination_rse["ratio"],
                vo_str,
            )
            rebalance_rse(
                rse_id=source_rse["id"],
                max_bytes=available_target_rebalance_volume,
                dry_run=dry_run,
                comment="Background rebalancing",
                force_expression=destination_rse["rse"],
                logger=logger,
            )
            destination_rse["receive_volume"] += available_target_rebalance_volume
            total_rebalance_volume += available_target_rebalance_volume
            available_source_rebalance_volume -= available_target_rebalance_volume

    must_sleep = True
    return must_sleep