def get_rses_to_process(rses, include_rses, exclude_rses, vos): """ Return the list of RSEs to process based on rses, include_rses and exclude_rses :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. :param include_rses: RSE expression to include RSEs. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", :returns: A list of RSEs to process """ multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) if not multi_vo: if vos: logging.log(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.') vos = ['def'] else: if vos: invalid = set(vos) - set([v['vo'] for v in list_vos()]) if invalid: msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) raise VONotFound(msg) else: vos = [v['vo'] for v in list_vos()] logging.log(logging.INFO, 'Reaper: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) cache_key = 'rses_to_process' if multi_vo: cache_key += '@%s' % '-'.join(vo for vo in vos) result = REGION.get(cache_key) if result is not NO_VALUE: return result all_rses = [] for vo in vos: all_rses.extend(list_rses(filters={'vo': vo})) if rses: invalid = set(rses) - set([rse['rse'] for rse in all_rses]) if invalid: msg = 'RSE{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(rse) for rse in invalid])) raise RSENotFound(msg) rses = [rse for rse in all_rses if rse['rse'] in rses] else: rses = all_rses if include_rses: included_rses = parse_expression(include_rses) rses = [rse for rse in rses if rse in included_rses] if exclude_rses: excluded_rses = parse_expression(exclude_rses) rses = [rse for rse in rses if rse not in excluded_rses] REGION.set(cache_key, rses) logging.log(logging.INFO, 'Reaper: This instance will work on RSEs: %s', ', '.join([rse['rse'] for rse in rses])) return rses
def get_conveyor_rses(rses=None, include_rses=None, exclude_rses=None, vos=None, logger=logging.log): """ Get a list of rses for conveyor :param rses: List of rses (Single-VO only) :param include_rses: RSEs to include :param exclude_rses: RSEs to exclude :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", or the current VO otherwise. :param logger: Optional decorated logger that can be passed from the calling daemons or servers. :return: List of working rses """ multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) if not multi_vo: if vos: logger(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.') vos = ['def'] else: if vos: invalid = set(vos) - set([v['vo'] for v in list_vos()]) if invalid: msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) raise VONotFound(msg) else: vos = [v['vo'] for v in list_vos()] logger(logging.INFO, 'This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) working_rses = [] rses_list = [] for vo in vos: rses_list.extend(list_rses(filters={'vo': vo})) if rses: working_rses = [rse for rse in rses_list if rse['rse'] in rses] if include_rses: for vo in vos: try: parsed_rses = parse_expression(include_rses, filter={'vo': vo}, session=None) except InvalidRSEExpression: logger(logging.ERROR, "Invalid RSE exception %s to include RSEs", include_rses) else: for rse in parsed_rses: if rse not in working_rses: working_rses.append(rse) if not (rses or include_rses): working_rses = rses_list if exclude_rses: try: parsed_rses = parse_expression(exclude_rses, session=None) except InvalidRSEExpression as error: logger(logging.ERROR, "Invalid RSE exception %s to exclude RSEs: %s", exclude_rses, error) else: working_rses = [rse for rse in working_rses if rse not in parsed_rses] working_rses = [rsemgr.get_rse_info(rse_id=rse['id']) for rse in working_rses] return working_rses
def convert_to_svo(old_vo, delete_vos=False, commit_changes=False, skip_history=False, echo=True): """ Converts a multi-VO database to a single-VO one by renaming the given VO and (optionally) deleting entries for other VOs and the super_root. Intended to be run on a copy of the original database that contains several VOs. :param old_vo: The 3 character string for the old VO. :param delete_vos: If True then all entries associated with a VO other than `old_vo` will be deleted. :param commit_changes: If True then changes are made against the database directly and the old super_root account will be (soft) deleted. If False, then nothing is commited and the commands needed are dumped to be run later. :param skip_history: If True then tables without FKC containing historical data will not be converted to save time. """ if not config_get_bool('common', 'multi_vo', False, False): print('Multi-VO mode is not enabled in the config file, aborting conversion.') return rename_vo(old_vo, 'def', commit_changes=commit_changes, skip_history=skip_history) s = session.get_session() if delete_vos: success_all = True for vo in list_vos(session=s): if vo['vo'] != 'def': success = remove_vo(vo['vo'], commit_changes=commit_changes, skip_history=skip_history) success_all = success_all and success if commit_changes and success_all: del_account(InternalAccount('super_root', vo='def'), session=s) s.close()
def convert_to_mvo(new_vo, description, email, create_super_root=False, commit_changes=False, skip_history=False, echo=True): """ Converts a single-VO database to a multi-VO one with the specified VO details. :param new_vo: The 3 character string for the new VO. :param description: Full description of the new VO. :param email: Admin email for the new VO. :param create_super_root: If True and the renaming was successful, then create a super_root account at VO def. :param commit_changes: If True then changes are made against the database directly. If False, then nothing is commited and the commands needed are dumped to be run later. :param skip_history: If True then tables without FKC containing historical data will not be converted to save time. """ if not config_get_bool('common', 'multi_vo', False, False): print('Multi-VO mode is not enabled in the config file, aborting conversion.') return s = session.get_session() vos = [vo['vo'] for vo in list_vos(session=s)] if new_vo not in vos: insert_new_vo = True else: insert_new_vo = False success = rename_vo('def', new_vo, insert_new_vo=insert_new_vo, description=description, email=email, commit_changes=commit_changes, skip_history=skip_history) if create_super_root and success: create_root_account(create_counters=False) s.close()
def _register_policy_package_surl_algorithms(): def try_importing_policy(vo=None): import importlib try: package = config.config_get( 'policy', 'package' + ('' if not vo else '-' + vo['vo'])) module = importlib.import_module(package) if hasattr(module, 'get_surl_algorithms'): _SURL_ALGORITHMS.update(module.get_surl_algorithms()) except (NoOptionError, NoSectionError, ImportError): pass from rucio.common import config from rucio.core.vo import list_vos try: multivo = config.config_get_bool('common', 'multi_vo') except (NoOptionError, NoSectionError): multivo = False if not multivo: # single policy package try_importing_policy() else: # policy package per VO vos = list_vos() for vo in vos: try_importing_policy(vo)
def oidc_auth(account, issuer, ui_vo=None): """ # TODO: Validate for Flask Open ID Connect Login :param account: Rucio account string :param issuer: issuer key (e.g. xdc, wlcg) as in the idpsecrets.json file :param ui_vo: 3 character string to identify the VO, if None will attempt to deduce it from `account` :returns: rendered final page or a page with error message """ if not account: account = 'webui' if ui_vo is None: all_vos = [vo['vo'] for vo in vo_core.list_vos()] valid_vos = [] for vo in all_vos: if account_exists(account, vo): valid_vos.append(vo) if len(valid_vos) == 0: return render_template( "problem.html", msg=('Cannot find any Rucio account %s at any VO.' % html_escape(account))) elif len(valid_vos) == 1: ui_vo = valid_vos[0] else: vos_with_desc = get_vo_descriptions(valid_vos) return add_cookies( make_response( render_template("select_login_method.html", oidc_issuers=AUTH_ISSUERS, saml_support=SAML_SUPPORT, possible_vos=vos_with_desc))) if not issuer: return render_template("problem.html", msg="Please provide IdP issuer.") realhome = request.environ.get( 'REQUEST_SCHEME') + '://' + request.environ.get( 'HTTP_HOST') + request.environ.get('SCRIPT_NAME') kwargs = { 'audience': None, 'auth_scope': None, 'issuer': issuer.lower(), 'auto': True, 'polling': False, 'refresh_lifetime': None, 'ip': None, 'webhome': realhome + '/oidc_final' } auth_url = auth.get_auth_oidc(account, vo=ui_vo, **kwargs) if not auth_url: return render_template( "problem.html", msg= "It was not possible to get the OIDC authentication url from the Rucio auth server. " + "In case you provided your account name, make sure it is known to Rucio." ) # NOQA: W503 return redirect(auth_url, code=303)
def get_vo_descriptions(vos): """ Gets the description for each VO in the list. :param vos: List of 3 character VO strings :returns: List of tuples containing VO string, VO description """ all_vos = vo_core.list_vos() vos_with_desc = [] for vo in all_vos: if vo['vo'] in vos: vos_with_desc.append((vo['vo'], vo['description'])) return vos_with_desc
def list_vos(issuer, vo='def'): ''' List the VOs. :param issuer: The user issuing the command. :param vo: The vo of the user issuing the command. ''' kwargs = {} if not has_permission(issuer=issuer, action='list_vos', kwargs=kwargs, vo=vo): raise exception.AccessDenied('Account {} cannot list VOs'.format(issuer)) return vo_core.list_vos()
def query_policy_packages(self): from rucio.core.vo import list_vos try: multivo = config.config_get_bool('common', 'multi_vo') except (NoOptionError, NoSectionError): multivo = False if not multivo: # single policy package self.try_importing_policy() else: # policy package per VO vos = list_vos() for vo in vos: self.try_importing_policy(vo)
def get_scope_name_regexps(): """ returns a list of all unique SCOPE_NAME_REGEXPs from all schemas """ if len(scope_name_regexps) == 0: # load schemas for all VOs here and add unique scope_name_regexps to list from rucio.core.vo import list_vos vos = list_vos() for vo in vos: if not vo['vo'] in schema_modules: load_schema_for_vo(vo['vo']) scope_name_regexp = schema_modules[vo['vo']].SCOPE_NAME_REGEXP if scope_name_regexp not in scope_name_regexps: scope_name_regexps.append(scope_name_regexp) return scope_name_regexps
def oidc_auth(account, issuer, ui_vo=None): """ Open ID Connect Login :param account: Rucio account string :param issuer: issuer key (e.g. xdc, wlcg) as in the idpsecrets.json file :param ui_vo: 3 character string to identify the VO, if None will attempt to deduce it from `account` :returns: rendered final page or a page with error message """ if not account: account = 'webui' if ui_vo is None: all_vos = [vo['vo'] for vo in vo_core.list_vos()] valid_vos = [] for vo in all_vos: if account_exists(account, vo): valid_vos.append(vo) if len(valid_vos) == 0: return RENDERER.problem( ('Cannot find any Rucio account %s at any VO.' % html_escape(account))) elif len(valid_vos) == 1: ui_vo = valid_vos[0] else: vos_with_desc = get_vo_descriptions(valid_vos) return RENDERER.select_login_method(AUTH_ISSUERS, SAML_SUPPORT, vos_with_desc) if not issuer: return RENDERER.problem("Please provide IdP issuer.") kwargs = { 'audience': None, 'auth_scope': None, 'issuer': issuer.lower(), 'auto': True, 'polling': False, 'refresh_lifetime': None, 'ip': None, 'webhome': ctx.realhome + '/oidc_final' } auth_url = auth.get_auth_oidc(account, vo=ui_vo, **kwargs) if not auth_url: return RENDERER.problem( "It was not possible to get the OIDC authentication url from the Rucio auth server. " + "In case you provided your account name, make sure it is known to Rucio." ) # NOQA: W503 return seeother(auth_url)
def run(total_workers=1, chunk_size=100, threads_per_worker=None, once=False, greedy=False, rses=[], scheme=None, exclude_rses=None, include_rses=None, vos=None, delay_seconds=0): """ Starts up the reaper threads. :param total_workers: The total number of workers. :param chunk_size: the size of chunk for deletion. :param threads_per_worker: Total number of threads created by each worker. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param scheme: Force the reaper to use a particular protocol/scheme, e.g., mock. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. :param include_rses: RSE expression to include RSEs. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", or the current VO otherwise. """ if rucio.db.sqla.util.is_old_db(): raise DatabaseException('Database was not updated, daemon won\'t start') logging.info('Reaper1 daemon will be deprecated and replaced by reaper2 with Rucio release 1.25 (~March 2021)!') logging.info('main: starting processes') multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) if not multi_vo: if vos: logging.warning('Ignoring argument vos, this is only applicable in a multi-VO setup.') vos = ['def'] else: if vos: invalid = set(vos) - set([v['vo'] for v in list_vos()]) if invalid: msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) raise VONotFound(msg) else: vos = [v['vo'] for v in list_vos()] logging.info('Reaper: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) all_rses = [] for vo in vos: all_rses.extend(rse_core.list_rses(filters={'vo': vo})) if rses: invalid = set(rses) - set([rse['rse'] for rse in all_rses]) if invalid: msg = 'RSE{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(rse) for rse in invalid])) raise RSENotFound(msg) rses = [rse for rse in all_rses if rse['rse'] in rses] else: rses = all_rses if exclude_rses: excluded_rses = parse_expression(exclude_rses) rses = [rse for rse in rses if rse not in excluded_rses] if include_rses: included_rses = parse_expression(include_rses) rses = [rse for rse in rses if rse in included_rses] if not rses: logging.error('Reaper: No RSEs found. Exiting.') return logging.info('Reaper: This instance will work on RSEs: ' + ', '.join([rse['rse'] for rse in rses])) threads = [] nb_rses_per_worker = int(math.ceil(len(rses) / float(total_workers))) or 1 rses = random.sample(rses, len(rses)) for worker in range(total_workers): for child in range(threads_per_worker or 1): rses_list = rses[worker * nb_rses_per_worker: worker * nb_rses_per_worker + nb_rses_per_worker] if not rses_list: logging.warning('Reaper: Empty RSEs list for worker %(worker)s' % locals()) continue kwargs = {'worker_number': worker, 'child_number': child, 'total_children': threads_per_worker or 1, 'once': once, 'chunk_size': chunk_size, 'greedy': greedy, 'rses': rses_list, 'delay_seconds': delay_seconds, 'scheme': scheme} threads.append(threading.Thread(target=reaper, kwargs=kwargs, name='Worker: %s, child: %s' % (worker, child))) [t.start() for t in threads] while threads[0].is_alive(): [t.join(timeout=3.14) for t in threads]
def run(total_workers=1, chunk_size=100, once=False, rses=[], scheme=None, exclude_rses=None, include_rses=None, vos=None, delay_seconds=0): """ Starts up the reaper threads. :param total_workers: The total number of workers. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. (Single-VO only) :param scheme: Force the reaper to use a particular protocol/scheme, e.g., mock. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. :param include_rses: RSE expression to include RSEs. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", or the current VO otherwise. """ setup_logging() if rucio.db.sqla.util.is_old_db(): raise DatabaseException('Database was not updated, daemon won\'t start') logging.info('main: starting processes') multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) if not multi_vo: if vos: logging.warning('Ignoring argument vos, this is only applicable in a multi-VO setup.') vos = ['def'] else: if vos: invalid = set(vos) - set([v['vo'] for v in list_vos()]) if invalid: msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) raise VONotFound(msg) else: vos = [v['vo'] for v in list_vos()] logging.info('Light Reaper: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) all_rses = [] for vo in vos: all_rses.extend(rse_core.list_rses(filters={'vo': vo})) if rses: invalid = set(rses) - set([rse['rse'] for rse in all_rses]) if invalid: msg = 'RSE{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(rse) for rse in invalid])) raise RSENotFound(msg) rses = [rse for rse in all_rses if rse['rse'] in rses] else: rses = all_rses if exclude_rses: excluded_rses = parse_expression(exclude_rses) rses = [rse for rse in rses if rse not in excluded_rses] if include_rses: included_rses = parse_expression(include_rses) rses = [rse for rse in rses if rse in included_rses] if not rses: logging.error('Light Reaper: No RSEs found. Exiting.') return threads = [] for worker in range(total_workers): kwargs = {'worker_number': worker, 'total_workers': total_workers, 'rses': rses, 'once': once, 'chunk_size': chunk_size, 'scheme': scheme} threads.append(threading.Thread(target=reaper, kwargs=kwargs, name='Worker: %s, Total_Workers: %s' % (worker, total_workers))) [t.start() for t in threads] while threads[0].is_alive(): [t.join(timeout=3.14) for t in threads]
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, rse_expression='MOCK', vos=None, max_replicas_per_rse=100, sleep_time=60): """ Main loop to check for available replicas which are labeled as suspicious Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table and available on other RSE. Finds surls of these replicas and declares them as bad. :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped. :param younger_than: The number of days since which bad_replicas table will be searched for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'), but 'AVAILABLE' on other RSE(s). :param nattempts: The minimum number of appearances in the bad_replica DB table in order to appear in the resulting list of replicas for recovery. :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE. If more is found, processing is skipped and warning is printed. :param sleep_time: Thread sleep time after each chunk of work. :returns: None """ # assembling the worker name identifier ('executable') including the rses from <rse_expression> # in order to have the possibility to detect a start of a second instance with the same set of RSES executable = argv[0] multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) if not multi_vo: if vos: logging.warning( 'Ignoring argument vos, this is only applicable in a multi-VO setup.' ) vos = ['def'] else: if vos: invalid = set(vos) - set([v['vo'] for v in list_vos()]) if invalid: msg = 'VO{} {} cannot be found'.format( 's' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) raise VONotFound(msg) else: vos = [v['vo'] for v in list_vos()] logging.info('replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) # Don't require a result from the expression at each VO, only raise if we can't get a result from any of them rses = [] exceptions_raised = 0 for vo in vos: try: parsed_rses = parse_expression(expression=rse_expression, filter_={'vo': vo}) except InvalidRSEExpression: exceptions_raised += 1 parsed_rses = [] for rse in parsed_rses: rses.append(rse['id']) if exceptions_raised == len(vos): raise InvalidRSEExpression('RSE Expression resulted in an empty set.') rses.sort() executable += ' --rse-expression ' + str(rses) sanity_check(executable=executable, hostname=socket.gethostname()) # make an initial heartbeat - expected only one replica-recoverer thread on one node # heartbeat mechanism is used in this daemon only for information purposes # (due to expected low load, the actual DB query does not filter the result based on worker number) live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) # wait a moment in case all workers started at the same time GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon) heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) total_workers = heartbeat['nr_threads'] worker_number = heartbeat['assign_thread'] # there is only 1 worker allowed for this daemon if total_workers != 1: logging.error( 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname()) die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) break start = time.time() logging.info( 'replica_recoverer[%i/%i]: ready to query replicas at RSE %s,' + ' reported suspicious in the last %i days at least %i times which are available on other RSEs.', # NOQA: W503 worker_number, total_workers, rse_expression, younger_than, nattempts) getfileskwargs = { 'younger_than': younger_than, 'nattempts': nattempts, 'exclude_states': ['B', 'R', 'D', 'L', 'T'], 'available_elsewhere': True, 'is_suspicious': True } # Don't require a result from the expression at each VO, only raise if we can't get a result from any of them recoverable_replicas = [] exceptions_raised = 0 for vo in vos: try: recoverable_replicas.extend( get_suspicious_files(rse_expression, filter_={'vo': vo}, **getfileskwargs)) except InvalidRSEExpression: exceptions_raised += 1 if exceptions_raised == len(vos): raise InvalidRSEExpression( 'RSE Expression resulted in an empty set.') logging.info( 'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.', worker_number, total_workers, time.time() - start, len(recoverable_replicas)) if not recoverable_replicas and not once: logging.info( 'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas.', worker_number, total_workers, len(recoverable_replicas)) daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) else: logging.info( 'replica_recoverer[%i/%i]: looking for replica surls.', worker_number, total_workers) start = time.time() surls_to_recover = { } # dictionary of { vo1: {rse1: [surl1, surl2, ... ], rse2: ...}, vo2:... } cnt_surl_not_found = 0 for replica in recoverable_replicas: scope = replica['scope'] name = replica['name'] vo = scope.vo rse = replica['rse'] rse_id = replica['rse_id'] if GRACEFUL_STOP.is_set(): break if vo not in surls_to_recover: surls_to_recover[vo] = {} if rse_id not in surls_to_recover[vo]: surls_to_recover[vo][rse_id] = [] # for each suspicious replica, we get its surl through the list_replicas function surl_not_found = True for rep in list_replicas([{'scope': scope, 'name': name}]): for site in rep['rses']: if site == rse_id: surls_to_recover[vo][rse_id].append( rep['rses'][site][0]) surl_not_found = False if surl_not_found: cnt_surl_not_found += 1 logging.warning( 'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.', worker_number, total_workers, name, rse) logging.info( 'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.', worker_number, total_workers, len(recoverable_replicas) - cnt_surl_not_found, len(recoverable_replicas), time.time() - start) for vo in surls_to_recover: for rse_id in surls_to_recover[vo]: logging.info( 'replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on %s: %s.', worker_number, total_workers, len(surls_to_recover[vo][rse_id]), rse, str(surls_to_recover[vo][rse_id])) if len(surls_to_recover[vo] [rse_id]) > max_replicas_per_rse: logging.warning( 'replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on %s. Please investigate.', worker_number, total_workers, max_replicas_per_rse, str(len(surls_to_recover[vo][rse_id])), rse) else: declare_bad_file_replicas( pfns=surls_to_recover[vo][rse_id], reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), status=BadFilesStatus.BAD, session=None) logging.info( 'replica_recoverer[%i/%i]: finished declaring bad replicas on %s.', worker_number, total_workers, rse) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) except Exception as err: logging.critical(traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) if once: break die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number, total_workers)
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5, sleep_time=300): """ Main loop to check for available replicas which are labeled as suspicious. Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table and available on other RSE. Finds surls of these replicas and declares them as bad. :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped. :param younger_than: The number of days since which bad_replicas table will be searched for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'), but 'AVAILABLE' on other RSE(s). :param nattempts: The minimum number of appearances in the bad_replica DB table in order to appear in the resulting list of replicas for recovery. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE is considered problematic and the suspicious replicas on that RSE are labeled as 'TEMPORARY_UNAVAILABLE'. :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than sleep_time, then it should sleep until sleep_time is over. :returns: None """ # assembling the worker name identifier ('executable') including the rses from <rse_expression> # in order to have the possibility to detect a start of a second instance with the same set of RSES executable = argv[0] prepend_str = 'replica_recoverer: ' logger = formatted_logger(logging.log, prepend_str + '%s') multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) if not multi_vo: if vos: logger(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.') vos = ['def'] else: if vos: invalid = set(vos) - set([v['vo'] for v in list_vos()]) if invalid: msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) raise VONotFound(msg) else: vos = [v['vo'] for v in list_vos()] logger(logging.INFO, 'replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) sanity_check(executable=executable, hostname=socket.gethostname()) # make an initial heartbeat - expected only one replica-recoverer thread on one node # heartbeat mechanism is used in this daemon only for information purposes # (due to expected low load, the actual DB query does not filter the result based on worker number) heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') # wait a moment in case all workers started at the same time GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon) heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) total_workers = heartbeat['nr_threads'] worker_number = heartbeat['assign_thread'] + 1 # there is only 1 worker allowed for this daemon if total_workers != 1: logger(logging.ERROR, 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname()) die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) break prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number, total_workers) logger = formatted_logger(logging.log, prepend_str + '%s') start = time.time() try: json_file = open("/opt/rucio/etc/suspicious_replica_recoverer.json") except: logger(logging.WARNING, "An error occured whilst trying to open the JSON file.") break try: json_data = json.load(json_file) except ValueError: logger(logging.WARNING, "No JSON object could be decoded.") # Checking that the json file is formatedd properly. for i, entry in enumerate(json_data): if "datatype" not in entry or "action" not in entry: logger(logging.ERROR, 'Entry %s in the json file is incomplete (missing either "datatype" or "action").', i) break logger(logging.INFO, 'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.', younger_than, nattempts) getfileskwargs = {'younger_than': younger_than, 'nattempts': nattempts, 'exclude_states': ['B', 'R', 'D', 'L', 'T'], 'is_suspicious': True} for vo in vos: logger(logging.INFO, 'Start replica recovery for VO: %s', vo) recoverable_replicas = {} if vo not in recoverable_replicas: recoverable_replicas[vo] = {} # rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true', filter={'vo': vo})], key=lambda k: k['rse']) rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true') if rse['vo'] == vo], key=lambda k: k['rse']) logger(logging.DEBUG, "List of RSEs with enable_suspicious_file_recovery = True:") for i in rse_list: logger(logging.DEBUG, '%s', i) for rse in rse_list: time_start_rse = time.time() rse_expr = rse['rse'] cnt_surl_not_found = 0 if rse_expr not in recoverable_replicas[vo]: recoverable_replicas[vo][rse_expr] = {} # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs suspicious_replicas_avail_elsewhere = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["EXIST_COPIES"].value, filter_={'vo': vo}, **getfileskwargs) # Get the suspicious replicas that are the last remaining copies suspicious_replicas_last_copy = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["LAST_COPY"].value, filter_={'vo': vo}, **getfileskwargs) logger(logging.DEBUG, 'Suspicious replicas on %s:', rse_expr) logger(logging.DEBUG, 'Replicas with copies on other RSEs (%s):', len(suspicious_replicas_avail_elsewhere)) for i in suspicious_replicas_avail_elsewhere: logger(logging.DEBUG, '%s', i) logger(logging.DEBUG, 'Replicas that are the last remaining copy (%s):', len(suspicious_replicas_last_copy)) for i in suspicious_replicas_last_copy: logger(logging.DEBUG, '%s', i) # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE. if (rse['availability'] not in {4, 5, 6, 7}) and ((len(suspicious_replicas_avail_elsewhere) > 0) or (len(suspicious_replicas_last_copy) > 0)): logger(logging.WARNING, "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate. \n", rse_expr, rse['availability']) continue if suspicious_replicas_avail_elsewhere: for replica in suspicious_replicas_avail_elsewhere: if vo == replica['scope'].vo: scope = replica['scope'] rep_name = replica['name'] rse_id = replica['rse_id'] surl_not_found = True for rep in list_replicas([{'scope': scope, 'name': rep_name}]): for rse_ in rep['rses']: if rse_ == rse_id: recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_][0], 'available_elsewhere': True} surl_not_found = False if surl_not_found: cnt_surl_not_found += 1 logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr) if suspicious_replicas_last_copy: for replica in suspicious_replicas_last_copy: if vo == replica['scope'].vo: scope = replica['scope'] rep_name = replica['name'] rse_id = replica['rse_id'] surl_not_found = True # Should only return one rse, as there is only one replica remaining for rep in list_replicas([{'scope': scope, 'name': rep_name}]): recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': False} surl_not_found = False if surl_not_found: cnt_surl_not_found += 1 logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr) logger(logging.INFO, 'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.', time.time() - time_start_rse, rse_expr, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy), len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) - cnt_surl_not_found, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy)) if len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) != 0: logger(logging.DEBUG, 'List of replicas on %s for which the pfns have been found:', rse_expr) for i in recoverable_replicas[vo][rse_expr]: logger(logging.DEBUG, '%s', i) # Log file is long and hard to read -> implement some spacing logger(logging.INFO, 'All RSEs have been checked for suspicious replicas. Total time: %s seconds.', time.time() - start) logger(logging.INFO, 'Begin check for problematic RSEs.') time_start_check_probl = time.time() # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE. # The suspicious files are marked as temporarily unavailable. list_problematic_rses = [] for rse_key in list(recoverable_replicas[vo].keys()): if len(recoverable_replicas[vo][rse_key].values()) > limit_suspicious_files_on_rse: list_problematic_rses.append(rse_key) surls_list = [] for replica_value in recoverable_replicas[vo][rse_key].values(): surls_list.append(replica_value['surl']) add_bad_pfns(pfns=surls_list, account=InternalAccount('root', vo=vo), state='TEMPORARY_UNAVAILABLE', expires_at=datetime.utcnow() + timedelta(days=3)) logger(logging.INFO, "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).", rse_key, limit_suspicious_files_on_rse) logger(logging.INFO, "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:", rse_key) for rse_values in recoverable_replicas[vo][rse_key].values(): logger(logging.INFO, 'Scope: %s Name: %s', rse_values['scope'], rse_values['name']) # Remove the RSE from the dictionary as it has been dealt with. del recoverable_replicas[vo][rse_key] logger(logging.INFO, "Following RSEs were deemed problematic (total: %s)", len(list_problematic_rses)) for rse in list_problematic_rses: logger(logging.INFO, "%s", rse) # Label suspicious replicas as bad if they have oher copies on other RSEs (that aren't also marked as suspicious). # If they are the last remaining copies, deal with them differently. for rse_key in list(recoverable_replicas[vo].keys()): files_to_be_declared_bad = [] files_to_be_ignored = [] # Remove RSEs from dictionary that don't have any suspicious replicas if len(recoverable_replicas[vo][rse_key]) == 0: del recoverable_replicas[vo][rse_key] continue # Get the rse_id by going to one of the suspicious replicas from that RSE and reading it from there rse_id = list(recoverable_replicas[vo][rse_key].values())[0]['rse_id'] for replica_key in list(recoverable_replicas[vo][rse_key].keys()): if recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is True: # Replicas with other copies on at least one other RSE can safely be labeled as bad files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl']) # Remove replica from dictionary del recoverable_replicas[vo][rse_key][replica_key] elif recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is False: if (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("log.")) or (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("user")): # Don't keep log files or user files files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl']) del recoverable_replicas[vo][rse_key][replica_key] else: # Deal with replicas based on their metadata. file_metadata = get_metadata(recoverable_replicas[vo][rse_key][replica_key]["scope"], recoverable_replicas[vo][rse_key][replica_key]["name"]) if file_metadata["datatype"] is None: # "None" type has no function "split()" files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl']) continue for i in json_data: if i["datatype"] == file_metadata["datatype"].split("_")[-1]: action = i["action"] if action == "ignore": files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl']) elif action == "declare bad": files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl']) else: logger(logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)", rse_key, replica_key, recoverable_replicas[vo][rse_key][replica_key]['surl'], i["datatype"], i["action"]) break else: # If no policy has be set, default to ignoring the file (no action taken). files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl']) logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be ignored:', rse_key) for i in files_to_be_ignored: logger(logging.INFO, '%s', i) logger(logging.INFO, '(%s) Remaining replica (pfns) that will be declared BAD:', rse_key) for i in files_to_be_declared_bad: logger(logging.INFO, '%s', i) if files_to_be_declared_bad: logger(logging.INFO, 'Ready to declare %s bad replica(s) on %s (RSE id: %s).', len(files_to_be_declared_bad), rse_key, str(rse_id)) declare_bad_file_replicas(pfns=files_to_be_declared_bad, reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), session=None) logger(logging.INFO, 'Finished declaring bad replicas on %s.\n', rse_key) logger(logging.INFO, 'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.', time.time() - time_start_check_probl) time_passed = time.time() - start logger(logging.INFO, 'Total time: %s seconds', time_passed) daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logger(logging.WARNING, traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logger(logging.WARNING, traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) else: logger(logging.CRITICAL, traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) except Exception as err: logger(logging.CRITICAL, traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) if once: break die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) logger(logging.INFO, 'Graceful stop done.')