def kronos_dataset(thread=0, dataset_queue=None, sleep_time=60):
    logging.info('kronos-dataset[%d/?] starting', thread)

    executable = 'kronos-dataset'
    hostname = socket.gethostname()
    pid = getpid()
    hb_thread = current_thread()

    dataset_wait = config_get_int('tracer-kronos', 'dataset_wait')
    start = datetime.now()
    sanity_check(executable=executable, hostname=hostname)
    while not graceful_stop.is_set():
        start_time = time()
        heart_beat = live(executable, hostname, pid, hb_thread)
        prepend_str = 'kronos-dataset[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prepend_str + '%s')
        if (datetime.now() - start).seconds > dataset_wait:
            __update_datasets(dataset_queue, logger=logger)
            start = datetime.now()
        daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop)
    die(executable=executable, hostname=hostname, pid=pid, thread=thread)

    # once again for the backlog
    logger(logging.INFO, 'cleaning dataset backlog before shutdown...')
    __update_datasets(dataset_queue)
def account_update(once=False, sleep_time=10):
    """
    Main loop to check and update the Account Counters.
    """
    logging.info('account_update: starting')
    logging.info('account_update: started')

    # Make an initial heartbeat so that all abacus-account daemons have the correct worker number on the next try
    executable = 'abacus-account'
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

    while not graceful_stop.is_set():
        try:
            # Heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

            # Select a bunch of account-RSE pairs to update for this worker
            start = time.time()  # NOQA
            account_rse_ids = get_updated_account_counters(total_workers=heartbeat['nr_threads'],
                                                           worker_number=heartbeat['assign_thread'])
            logging.debug('Index query time %f size=%d' % (time.time() - start, len(account_rse_ids)))

            # If the list is empty, send the worker to sleep
            if not account_rse_ids and not once:
                logging.info('account_update[%s/%s] did not get any work' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1))
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop)
            else:
                for account_rse_id in account_rse_ids:
                    if graceful_stop.is_set():
                        break
                    start_time = time.time()
                    update_account_counter(account=account_rse_id[0], rse_id=account_rse_id[1])
                    logging.debug('account_update[%s/%s]: update of account-rse counter "%s-%s" took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, account_rse_id[0], account_rse_id[1], time.time() - start_time))
        except Exception:
            logging.error(traceback.format_exc())
        if once:
            break

    logging.info('account_update: graceful stop requested')
    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
    logging.info('account_update: graceful stop done')
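# The main loops above and below are normally driven by a small launcher that spawns
# one thread per worker and flips the shared stop event on shutdown. The run()/stop()
# names and signatures in this sketch are illustrative assumptions, not the daemon's
# actual entry points; they only show the pattern implied by graceful_stop and the
# heartbeat-based worker numbering.
def run(threads=1, once=False, sleep_time=10):
    # Hypothetical launcher: spawn N workers running account_update().
    workers = [threading.Thread(target=account_update, kwargs={'once': once, 'sleep_time': sleep_time})
               for _ in range(threads)]
    for worker in workers:
        worker.start()
    # Keep the main thread responsive to signals while workers run.
    while any(worker.is_alive() for worker in workers):
        for worker in workers:
            worker.join(timeout=3.14)


def stop(signum=None, frame=None):
    # Request a graceful shutdown; each worker notices on its next loop iteration.
    graceful_stop.set()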
def preparer(once, sleep_time, bulk, partition_wait_time=10):
    # Make an initial heartbeat so that all instanced daemons have the correct worker number on the next try
    logger_prefix = executable = 'conveyor-preparer'

    with HeartbeatHandler(executable=executable, logger_prefix=logger_prefix) as heartbeat_handler:
        if partition_wait_time is not None:
            graceful_stop.wait(partition_wait_time)  # gathering of daemons/threads on first start
        while not graceful_stop.is_set():
            start_time = time()
            heart_beat, logger = heartbeat_handler.live()
            worker_number = heart_beat['assign_thread']
            total_workers = heart_beat['nr_threads']
            prefix = 'conveyor-preparer[%s/%s] ' % (worker_number, total_workers)
            daemon_logger = formatted_logger(logging.log, prefix + '%s')

            count = 0
            try:
                count, updated_msg = run_once(total_workers=total_workers, worker_number=worker_number, limit=bulk, logger=logger)
            except RucioException:
                daemon_logger(logging.ERROR, 'errored with a RucioException, retrying later', exc_info=True)
                updated_msg = 'errored'

            if once:
                break

            end_time = time()
            time_diff = end_time - start_time
            logger(logging.INFO, '%s, taking %.3f seconds' % (updated_msg, time_diff))
            if count < bulk:
                daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)
def rse_update(once=False, sleep_time=10):
    """
    Main loop to check and update the RSE Counters.
    """
    # Make an initial heartbeat so that all abacus-rse daemons have the correct worker number on the next try
    executable = 'abacus-rse'
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

    while not graceful_stop.is_set():
        try:
            # Heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
            prepend_str = 'rse_update[%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')

            # Select a bunch of rses to update for this worker
            start = time.time()  # NOQA
            rse_ids = get_updated_rse_counters(total_workers=heartbeat['nr_threads'],
                                               worker_number=heartbeat['assign_thread'])
            logger(logging.DEBUG, 'Index query time %f size=%d' % (time.time() - start, len(rse_ids)))

            # If the list is empty, send the worker to sleep
            if not rse_ids and not once:
                logger(logging.INFO, 'did not get any work')
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop)
            else:
                for rse_id in rse_ids:
                    if graceful_stop.is_set():
                        break
                    start_time = time.time()
                    update_rse_counter(rse_id=rse_id)
                    logger(logging.DEBUG, 'update of rse "%s" took %f' % (rse_id, time.time() - start_time))
        except Exception:
            logger(logging.ERROR, traceback.format_exc())
        if once:
            break

    logging.info('rse_update: graceful stop requested')
    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
    logging.info('rse_update: graceful stop done')
def preparer(once, sleep_time, bulk):
    # Make an initial heartbeat so that all instanced daemons have the correct worker number on the next try
    executable = 'conveyor-preparer'
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()
    worker_number = current_thread
    total_workers = '?'
    prefix = 'conveyor-preparer[%s/%s] ' % (worker_number, total_workers)
    daemon_logger = formatted_logger(logging.log, prefix + '%s')
    heartbeat.sanity_check(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

    try:
        graceful_stop.wait(10)  # gathering of daemons/threads on first start
        while not graceful_stop.is_set():
            start_time = time()

            pulse = heartbeat.live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
            worker_number = pulse['assign_thread']
            total_workers = pulse['nr_threads']

            prefix = 'conveyor-preparer[%s/%s] ' % (worker_number, total_workers)
            daemon_logger = formatted_logger(logging.log, prefix + '%s')

            try:
                updated_msg = run_once(total_workers=total_workers, worker_number=worker_number, limit=bulk, logger=daemon_logger)
            except RucioException:
                daemon_logger(logging.ERROR, 'errored with a RucioException, retrying later', exc_info=True)
                updated_msg = 'errored'

            if once:
                break

            end_time = time()
            time_diff = end_time - start_time
            daemon_logger(logging.INFO, '%s, taking %.3f seconds' % (updated_msg, time_diff))
            daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=daemon_logger)

        daemon_logger(logging.INFO, 'gracefully stopping')
    finally:
        heartbeat.die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
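# The two preparer() variants above show the same loop written against the manual
# heartbeat API (sanity_check/live/die) and against a HeartbeatHandler context manager.
# The class below is a minimal, assumption-laden sketch of what such a context manager
# could look like when built on the manual calls; the real HeartbeatHandler has more
# behaviour, so treat the constructor arguments and return values here as illustrative only.
class SimpleHeartbeatHandler:
    def __init__(self, executable, logger_prefix=None):
        self.executable = executable
        self.logger_prefix = logger_prefix or executable
        self.hostname = socket.gethostname()
        self.pid = os.getpid()
        self.thread = threading.current_thread()

    def __enter__(self):
        # Register the worker once on entry.
        heartbeat.sanity_check(executable=self.executable, hostname=self.hostname)
        return self

    def live(self):
        # Refresh the heartbeat and return it together with a prefixed logger,
        # mirroring the (heart_beat, logger) pair used by preparer() above.
        pulse = heartbeat.live(executable=self.executable, hostname=self.hostname, pid=self.pid, thread=self.thread)
        prefix = '%s[%s/%s] ' % (self.logger_prefix, pulse['assign_thread'], pulse['nr_threads'])
        return pulse, formatted_logger(logging.log, prefix + '%s')

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Always deregister the heartbeat, even if the loop raised.
        heartbeat.die(executable=self.executable, hostname=self.hostname, pid=self.pid, thread=self.thread)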
def collection_replica_update(once=False, limit=1000, sleep_time=10):
    """
    Main loop to check and update the collection replicas.
    """
    logging.info('collection_replica_update: starting')
    logging.info('collection_replica_update: started')

    # Make an initial heartbeat so that all abacus-collection-replica daemons have the correct worker number on the next try
    executable = 'abacus-collection-replica'
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

    while not graceful_stop.is_set():
        try:
            # Heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

            # Select a bunch of collection replicas to update for this worker
            start = time.time()  # NOQA
            replicas = get_cleaned_updated_collection_replicas(total_workers=heartbeat['nr_threads'] - 1,
                                                               worker_number=heartbeat['assign_thread'],
                                                               limit=limit)
            logging.debug('Index query time %f size=%d' % (time.time() - start, len(replicas)))

            # If the list is empty, send the worker to sleep
            if not replicas and not once:
                logging.info('collection_replica_update[%s/%s] did not get any work' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1))
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop)
            else:
                for replica in replicas:
                    if graceful_stop.is_set():
                        break
                    start_time = time.time()
                    update_collection_replica(replica)
                    logging.debug('collection_replica_update[%s/%s]: update of collection replica "%s" took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, replica['id'], time.time() - start_time))

                if limit and len(replicas) < limit and not once:
                    daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop)
        except Exception:
            logging.error(traceback.format_exc())
        if once:
            break

    logging.info('collection_replica_update: graceful stop requested')
    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
    logging.info('collection_replica_update: graceful stop done')
def rule_repairer(once=False, sleep_time=60):
    """
    Main loop to check for STUCK replication rules
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-repairers have the correct worker number on the next try
    executable = 'judge-repairer'
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            # Select a bunch of rules for this worker to repair
            rules = get_stuck_rules(total_workers=heartbeat['nr_threads'],
                                    worker_number=heartbeat['assign_thread'],
                                    delta=-1 if once else 1800,
                                    limit=100,
                                    blocked_rules=[key for key in paused_rules])
            logging.debug('rule_repairer[%s/%s] index query time %f fetch size is %d' % (heartbeat['assign_thread'], heartbeat['nr_threads'], time.time() - start, len(rules)))

            if not rules and not once:
                logging.debug('rule_repairer[%s/%s] did not get any work (paused_rules=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'], str(len(paused_rules))))
                daemon_sleep(start, sleep_time, graceful_stop)
            else:
                for rule_id in rules:
                    rule_id = rule_id[0]
                    logging.info('rule_repairer[%s/%s]: Repairing rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], rule_id))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        repair_rule(rule_id=rule_id)
                        logging.debug('rule_repairer[%s/%s]: repairing of %s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'], rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                            logging.warning('rule_repairer[%s/%s]: Locks detected for %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], rule_id))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400,
            once=True, unlock=False, spread_period=0, purge_replicas=False,
            sleep_time=60):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param grace_period: The grace_period for the rules.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    executable = 'atropos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    time.sleep(10)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.DEBUG, 'Starting worker')
    summary = {}
    lifetime_exceptions = {}
    rand = random.Random(hb['assign_thread'])
    for excep in rucio.core.lifetime_exception.list_exceptions(exception_id=None, states=[LifetimeExceptionsState.APPROVED, ], session=None):
        key = '{}:{}'.format(excep['scope'].internal, excep['name'])
        if key not in lifetime_exceptions:
            lifetime_exceptions[key] = excep['expires_at']
        elif lifetime_exceptions[key] < excep['expires_at']:
            lifetime_exceptions[key] = excep['expires_at']
    logger(logging.DEBUG, '%s active exceptions' % len(lifetime_exceptions))
    if not dry_run and date_check > now:
        logger(logging.ERROR, 'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not GRACEFUL_STOP.is_set():
            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'], session=None)
                logger(logging.INFO, '%s rules to process' % (len(rules)))
                for rule_idx, rule in enumerate(rules, start=1):
                    did = '%s:%s' % (rule.scope, rule.name)
                    did_key = '{}:{}'.format(rule.scope.internal, rule.name)
                    logger(logging.DEBUG, 'Working on rule %s on DID %s on %s' % (rule.id, did, rule.rse_expression))

                    if (rule_idx % 1000) == 0:
                        logger(logging.INFO, '%s/%s rules processed' % (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    try:
                        rses = parse_expression(rule.rse_expression, filter_={'vo': rule.account.vo})
                    except InvalidRSEExpression:
                        logger(logging.WARNING, 'Rule %s has an RSE expression that results in an empty set: %s' % (rule.id, rule.rse_expression))
                        continue
                    eol_at = rucio.core.lifetime_exception.define_eol(rule.scope, rule.name, rses)
                    if eol_at != rule.eol_at:
                        logger(logging.WARNING, 'The computed eol %s differs from the one recorded %s for rule %s on %s at %s' % (eol_at, rule.eol_at, rule.id, did, rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logger(logging.WARNING, 'Cannot find rule %s on DID %s' % (rule.id, did))
                            continue

                    # Check the exceptions
                    if did_key in lifetime_exceptions:
                        if eol_at > lifetime_exceptions[did_key]:
                            logger(logging.INFO, 'Rule %s on DID %s on %s has longer expiration date than the one requested : %s' % (rule.id, did, rule.rse_expression, lifetime_exceptions[did_key]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logger(logging.INFO, 'Updating rule %s on DID %s on %s according to the exception till %s' % (rule.id, did, rule.rse_expression, lifetime_exceptions[did_key]))
                            eol_at = lifetime_exceptions[did_key]
                            try:
                                update_rule(rule.id, options={'eol_at': lifetime_exceptions[did_key]})
                            except RuleNotFound:
                                logger(logging.WARNING, 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue

                    # Now check that the new eol_at is expired
                    if eol_at and eol_at <= date_check:
                        set_metadata(scope=rule.scope, name=rule.name, key='eol_at', value=eol_at)
                        no_locks = True
                        for lock in get_dataset_locks(rule.scope, rule.name):
                            if lock['rule_id'] == rule[4]:
                                no_locks = False
                                if lock['rse_id'] not in summary:
                                    summary[lock['rse_id']] = {}
                                if did_key not in summary[lock['rse_id']]:
                                    summary[lock['rse_id']][did_key] = {'length': lock['length'] or 0, 'bytes': lock['bytes'] or 0}
                        if no_locks:
                            logger(logging.WARNING, 'Cannot find a lock for rule %s on DID %s' % (rule.id, did))
                        if not dry_run:
                            lifetime = grace_period + rand.randrange(spread_period + 1)
                            logger(logging.INFO, 'Setting %s seconds lifetime for rule %s' % (lifetime, rule.id))
                            options = {'lifetime': lifetime}
                            if purge_replicas:
                                options['purge_replicas'] = True
                            if rule.locked and unlock:
                                logger(logging.INFO, 'Unlocking rule %s', rule.id)
                                options['locked'] = False
                            try:
                                update_rule(rule.id, options=options)
                            except RuleNotFound:
                                logger(logging.WARNING, 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

            for rse_id in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse_id]:
                    tot_datasets += 1
                    tot_files += summary[rse_id][did].get('length', 0)
                    tot_size += summary[rse_id][did].get('bytes', 0)
                vo = get_rse_vo(rse_id=rse_id)
                logger(logging.INFO, 'For RSE %s %s %s datasets will be deleted representing %s files and %s bytes' % (get_rse_name(rse_id=rse_id), '' if vo == 'def' else 'on VO ' + vo, tot_datasets, tot_files, tot_size))

            if once:
                break
            else:
                daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False, sleep_time=60):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    executable = 'undertaker'
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)

    paused_dids = {}  # {(scope, name): datetime}

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, older_than=6000)
            prepend_str = 'undertaker [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))

            start = time.time()

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'], total_workers=heartbeat['nr_threads'], limit=10000)
            dids = [did for did in dids if (did['scope'], did['name']) not in paused_dids]

            if not dids and not once:
                logger(logging.INFO, 'Nothing to do. sleep 60.')
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logger(logging.INFO, 'Receive %s dids to delete', len(chunk))
                    delete_dids(dids=chunk, account=InternalAccount('root', vo='def'), expire_rules=True)
                    logger(logging.INFO, 'Delete %s dids', len(chunk))
                    record_counter(name='undertaker.delete_dids', delta=len(chunk))
                except RuleNotFound as error:
                    logger(logging.ERROR, error)
                except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                    if match('.*ORA-00054.*', str(e.args[0])) or match('.*55P03.*', str(e.args[0])) or match('.*3572.*', str(e.args[0])):
                        for did in chunk:
                            paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                        record_counter('undertaker.delete_dids.exceptions.{exception}', labels={'exception': 'LocksDetected'})
                        logger(logging.WARNING, 'Locks detected for chunk')
                    else:
                        logger(logging.ERROR, 'Got database error %s.', str(e))
        except:  # noqa: E722
            logging.critical(traceback.format_exc())
            time.sleep(1)
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5, sleep_time=300):
    """
    Main loop to check for available replicas which are labeled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None, we either use all VOs if run from "def",
    :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE
                                          is considered problematic and the suspicious replicas on that RSE
                                          are labeled as 'TEMPORARY_UNAVAILABLE'.
    :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than sleep_time, then it should sleep until sleep_time is over.
    :returns: None
    """
    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES
    executable = argv[0]

    prepend_str = 'replica_recoverer: '
    logger = formatted_logger(logging.log, prepend_str + '%s')

    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    if not multi_vo:
        if vos:
            logger(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.')
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - set([v['vo'] for v in list_vos()])
            if invalid:
                msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logger(logging.INFO, 'replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos])))

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logger(logging.ERROR, 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname())
                die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
                break

            prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number, total_workers)
            logger = formatted_logger(logging.log, prepend_str + '%s')

            start = time.time()

            try:
                json_file = open("/opt/rucio/etc/suspicious_replica_recoverer.json")
            except:  # noqa: E722
                logger(logging.WARNING, "An error occurred whilst trying to open the JSON file.")
                break

            try:
                json_data = json.load(json_file)
            except ValueError:
                logger(logging.WARNING, "No JSON object could be decoded.")

            # Checking that the json file is formatted properly.
            for i, entry in enumerate(json_data):
                if "datatype" not in entry or "action" not in entry:
                    logger(logging.ERROR, 'Entry %s in the json file is incomplete (missing either "datatype" or "action").', i)
                    break

            logger(logging.INFO, 'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.', younger_than, nattempts)

            getfileskwargs = {'younger_than': younger_than,
                              'nattempts': nattempts,
                              'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                              'is_suspicious': True}

            for vo in vos:
                logger(logging.INFO, 'Start replica recovery for VO: %s', vo)
                recoverable_replicas = {}
                if vo not in recoverable_replicas:
                    recoverable_replicas[vo] = {}

                # rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true', filter={'vo': vo})], key=lambda k: k['rse'])
                rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true') if rse['vo'] == vo], key=lambda k: k['rse'])

                logger(logging.DEBUG, "List of RSEs with enable_suspicious_file_recovery = True:")
                for i in rse_list:
                    logger(logging.DEBUG, '%s', i)

                for rse in rse_list:
                    time_start_rse = time.time()
                    rse_expr = rse['rse']
                    cnt_surl_not_found = 0
                    if rse_expr not in recoverable_replicas[vo]:
                        recoverable_replicas[vo][rse_expr] = {}
                    # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs
                    suspicious_replicas_avail_elsewhere = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["EXIST_COPIES"].value, filter_={'vo': vo}, **getfileskwargs)
                    # Get the suspicious replicas that are the last remaining copies
                    suspicious_replicas_last_copy = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["LAST_COPY"].value, filter_={'vo': vo}, **getfileskwargs)

                    logger(logging.DEBUG, 'Suspicious replicas on %s:', rse_expr)
                    logger(logging.DEBUG, 'Replicas with copies on other RSEs (%s):', len(suspicious_replicas_avail_elsewhere))
                    for i in suspicious_replicas_avail_elsewhere:
                        logger(logging.DEBUG, '%s', i)
                    logger(logging.DEBUG, 'Replicas that are the last remaining copy (%s):', len(suspicious_replicas_last_copy))
                    for i in suspicious_replicas_last_copy:
                        logger(logging.DEBUG, '%s', i)

                    # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE.
                    if (rse['availability'] not in {4, 5, 6, 7}) and ((len(suspicious_replicas_avail_elsewhere) > 0) or (len(suspicious_replicas_last_copy) > 0)):
                        logger(logging.WARNING, "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate.\n", rse_expr, rse['availability'])
                        continue

                    if suspicious_replicas_avail_elsewhere:
                        for replica in suspicious_replicas_avail_elsewhere:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    for rse_ in rep['rses']:
                                        if rse_ == rse_id:
                                            recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_][0], 'available_elsewhere': True}
                                            surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    if suspicious_replicas_last_copy:
                        for replica in suspicious_replicas_last_copy:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                # Should only return one rse, as there is only one replica remaining
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': False}
                                    surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    logger(logging.INFO, 'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.',
                           time.time() - time_start_rse, rse_expr,
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy),
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) - cnt_surl_not_found,
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy))

                    if len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) != 0:
                        logger(logging.DEBUG, 'List of replicas on %s for which the pfns have been found:', rse_expr)
                        for i in recoverable_replicas[vo][rse_expr]:
                            logger(logging.DEBUG, '%s', i)

                # Log file is long and hard to read -> implement some spacing
                logger(logging.INFO, 'All RSEs have been checked for suspicious replicas. Total time: %s seconds.', time.time() - start)
                logger(logging.INFO, 'Begin check for problematic RSEs.')
                time_start_check_probl = time.time()

                # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE.
                # The suspicious files are marked as temporarily unavailable.
                list_problematic_rses = []
                for rse_key in list(recoverable_replicas[vo].keys()):
                    if len(recoverable_replicas[vo][rse_key].values()) > limit_suspicious_files_on_rse:
                        list_problematic_rses.append(rse_key)
                        surls_list = []
                        for replica_value in recoverable_replicas[vo][rse_key].values():
                            surls_list.append(replica_value['surl'])

                        add_bad_pfns(pfns=surls_list, account=InternalAccount('root', vo=vo), state='TEMPORARY_UNAVAILABLE', expires_at=datetime.utcnow() + timedelta(days=3))

                        logger(logging.INFO, "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).", rse_key, limit_suspicious_files_on_rse)
                        logger(logging.INFO, "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:", rse_key)
                        for rse_values in recoverable_replicas[vo][rse_key].values():
                            logger(logging.INFO, 'Scope: %s  Name: %s', rse_values['scope'], rse_values['name'])
                        # Remove the RSE from the dictionary as it has been dealt with.
                        del recoverable_replicas[vo][rse_key]

                logger(logging.INFO, "Following RSEs were deemed problematic (total: %s)", len(list_problematic_rses))
                for rse in list_problematic_rses:
                    logger(logging.INFO, "%s", rse)

                # Label suspicious replicas as bad if they have other copies on other RSEs (that aren't also marked as suspicious).
                # If they are the last remaining copies, deal with them differently.
                for rse_key in list(recoverable_replicas[vo].keys()):
                    files_to_be_declared_bad = []
                    files_to_be_ignored = []
                    # Remove RSEs from dictionary that don't have any suspicious replicas
                    if len(recoverable_replicas[vo][rse_key]) == 0:
                        del recoverable_replicas[vo][rse_key]
                        continue
                    # Get the rse_id by going to one of the suspicious replicas from that RSE and reading it from there
                    rse_id = list(recoverable_replicas[vo][rse_key].values())[0]['rse_id']
                    for replica_key in list(recoverable_replicas[vo][rse_key].keys()):
                        if recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is True:
                            # Replicas with other copies on at least one other RSE can safely be labeled as bad
                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                            # Remove replica from dictionary
                            del recoverable_replicas[vo][rse_key][replica_key]
                        elif recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is False:
                            if (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("log.")) or (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("user")):
                                # Don't keep log files or user files
                                files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                del recoverable_replicas[vo][rse_key][replica_key]
                            else:
                                # Deal with replicas based on their metadata.
                                file_metadata = get_metadata(recoverable_replicas[vo][rse_key][replica_key]["scope"], recoverable_replicas[vo][rse_key][replica_key]["name"])
                                if file_metadata["datatype"] is None:  # "None" type has no function "split()"
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                    continue
                                for i in json_data:
                                    if i["datatype"] == file_metadata["datatype"].split("_")[-1]:
                                        action = i["action"]
                                        if action == "ignore":
                                            files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        elif action == "declare bad":
                                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        else:
                                            logger(logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)",
                                                   rse_key, replica_key, recoverable_replicas[vo][rse_key][replica_key]['surl'], i["datatype"], i["action"])
                                        break
                                else:
                                    # If no policy has been set, default to ignoring the file (no action taken).
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])

                    logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be ignored:', rse_key)
                    for i in files_to_be_ignored:
                        logger(logging.INFO, '%s', i)
                    logger(logging.INFO, '(%s) Remaining replica (pfns) that will be declared BAD:', rse_key)
                    for i in files_to_be_declared_bad:
                        logger(logging.INFO, '%s', i)

                    if files_to_be_declared_bad:
                        logger(logging.INFO, 'Ready to declare %s bad replica(s) on %s (RSE id: %s).', len(files_to_be_declared_bad), rse_key, str(rse_id))

                        declare_bad_file_replicas(pfns=files_to_be_declared_bad, reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), session=None)

                        logger(logging.INFO, 'Finished declaring bad replicas on %s.\n', rse_key)

                logger(logging.INFO, 'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.', time.time() - time_start_check_probl)

                time_passed = time.time() - start
                logger(logging.INFO, 'Total time: %s seconds', time_passed)
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
            elif match('.*ORA-03135.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
            else:
                logger(logging.CRITICAL, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        except Exception as err:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    logger(logging.INFO, 'Graceful stop done.')
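# The recoverer above reads /opt/rucio/etc/suspicious_replica_recoverer.json and expects
# a list of entries, each carrying a "datatype" (compared against the last "_"-separated
# token of the file's datatype metadata) and an "action" of either "ignore" or
# "declare bad". The structure below is a hypothetical example of such a policy; the
# datatype values are made-up placeholders, not a recommended configuration.
example_policy = [
    {"datatype": "RAW", "action": "declare bad"},
    {"datatype": "LOG", "action": "ignore"},
]

# The daemon's own sanity check, applied to this example: every entry must carry both keys.
for i, entry in enumerate(example_policy):
    assert "datatype" in entry and "action" in entry, 'Entry %s is incomplete' % i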
def rule_cleaner(once=False, sleep_time=60):
    """
    Main loop to check for expired replication rules
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-cleaners have the correct worker number on the next try
    executable = 'judge-cleaner'
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
    prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
            prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_expired_rules(total_workers=heartbeat['nr_threads'],
                                      worker_number=heartbeat['assign_thread'],
                                      limit=200,
                                      blocked_rules=[key for key in paused_rules])
            logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            if not rules and not once:
                logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules)))
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logger(logging.INFO, 'Deleting rule %s with expression %s' % (rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logger(logging.DEBUG, 'deletion of %s took %f' % (rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                            record_counter('rule.judge.exceptions.{exception}', labels={'exception': 'LocksDetected'})
                            logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                        else:
                            logger(logging.ERROR, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                    except RuleNotFound:
                        pass
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            elif match('.*ORA-03135.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            else:
                logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        except Exception as e:
            logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def reaper(rses, chunk_size=100, once=False, scheme=None, sleep_time=300):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rses)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    # heartbeat
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Starting Dark Reaper on RSEs: %s', ', '.join(rses))

    if not once:
        logger(logging.INFO, 'Waiting for heartbeat synchronization')
        GRACEFUL_STOP.wait(10)  # To prevent running on the same partition if all the reapers restart at the same time

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True
            start_time = time.time()

            rses_to_process = list(set(rses) & set(list_rses_with_quarantined_replicas()))
            random.shuffle(rses_to_process)
            for rse_id in rses_to_process:
                # The following query returns the list of real replicas (deleted_replicas) and list of dark replicas (dark_replicas)
                # Real replicas can be directly removed from the quarantine table
                deleted_replicas, dark_replicas = list_quarantined_replicas(rse_id=rse_id,
                                                                            limit=chunk_size,
                                                                            worker_number=heartbeat['assign_thread'],
                                                                            total_workers=heartbeat['nr_threads'])

                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                rse = rse_info['rse']
                prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
                prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
                logger = formatted_logger(logging.log, prepend_str + '%s')
                try:
                    prot.connect()
                    for replica in dark_replicas:
                        nothing_to_do = False
                        scope = ''
                        if replica['scope']:
                            scope = replica['scope'].external
                        try:
                            pfn = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                            lfns=[{'scope': scope, 'name': replica['name'], 'path': replica['path']}],
                                                            operation='delete',
                                                            scheme=scheme).values())[0])
                            logger(logging.INFO, 'Deletion ATTEMPT of %s:%s as %s on %s', scope, replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logger(logging.INFO, 'Deletion SUCCESS of %s:%s as %s on %s in %s seconds', scope, replica['name'], pfn, rse, duration)
                            payload = {'scope': scope,
                                       'name': replica['name'],
                                       'rse': rse,
                                       'rse_id': rse_id,
                                       'file-size': replica.get('bytes') or 0,
                                       'bytes': replica.get('bytes') or 0,
                                       'url': pfn,
                                       'duration': duration,
                                       'protocol': prot.attributes['scheme']}
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-done', payload)
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = ('Deletion NOTFOUND of %s:%s as %s on %s' % (scope, replica['name'], pfn, rse))
                            logger(logging.WARNING, err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                            err_msg = ('Deletion NOACCESS of %s:%s as %s on %s: %s' % (scope, replica['name'], pfn, rse, str(error)))
                            logger(logging.WARNING, err_msg)
                            payload = {'scope': scope,
                                       'name': replica['name'],
                                       'rse': rse,
                                       'rse_id': rse_id,
                                       'file-size': replica['bytes'] or 0,
                                       'bytes': replica['bytes'] or 0,
                                       'url': pfn,
                                       'reason': str(error),
                                       'protocol': prot.attributes['scheme']}
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-failed', payload)
                        except Exception:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id, replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logger(logging.INFO, 'Nothing to do')
                daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        except DatabaseException as error:
            logging.warning('Reaper: %s', str(error))
        except Exception:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
def re_evaluator(once=False, sleep_time=30, did_limit=100):
    """
    Main loop to check the re-evaluation of dids.
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_dids = {}  # {(scope, name): datetime}

    # Make an initial heartbeat so that all judge-evaluators have the correct worker number on the next try
    executable = 'judge-evaluator'
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)

            start = time.time()  # NOQA

            # Refresh paused dids
            paused_dids = dict((k, v) for k, v in iteritems(paused_dids) if datetime.utcnow() < v)

            # Select a bunch of dids for re-evaluation for this worker
            dids = get_updated_dids(total_workers=heartbeat['nr_threads'],
                                    worker_number=heartbeat['assign_thread'],
                                    limit=did_limit,
                                    blocked_dids=[(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids])
            logging.debug('re_evaluator[%s/%s] index query time %f fetch size is %d (%d blocked)' % (heartbeat['assign_thread'], heartbeat['nr_threads'],
                                                                                                     time.time() - start, len(dids),
                                                                                                     len([(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids])))

            # If the list is empty, send the worker to sleep
            if not dids and not once:
                logging.debug('re_evaluator[%s/%s] did not get any work (paused_dids=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'], str(len(paused_dids))))
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop)
            else:
                done_dids = {}
                for did in dids:
                    if graceful_stop.is_set():
                        break

                    # Check if this did has already been operated on
                    did_tag = '%s:%s' % (did.scope.internal, did.name)
                    if did_tag in done_dids:
                        if did.rule_evaluation_action in done_dids[did_tag]:
                            logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s already done' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                            delete_updated_did(id_=did.id)
                            continue
                    else:
                        done_dids[did_tag] = []

                    # Jump paused dids
                    if (did.scope.internal, did.name) in paused_dids:
                        continue

                    try:
                        start_time = time.time()
                        re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action)
                        logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name, time.time() - start_time))
                        delete_updated_did(id_=did.id)
                        done_dids[did_tag].append(did.rule_evaluation_action)
                    except DataIdentifierNotFound:
                        delete_updated_did(id_=did.id)
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-000(01|54).*', str(e.args[0])):
                            paused_dids[(did.scope.internal, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                            record_counter('rule.judge.exceptions.{exception}', labels={'exception': 'LocksDetected'})
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                    except ReplicationRuleCreationTemporaryFailed as e:
                        record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                        logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                    except FlushError as e:
                        record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                        logging.warning('re_evaluator[%s/%s]: Flush error for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            elif match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def minos(bulk=1000, once=False, sleep_time=60): """ Creates a Minos Worker that gets a list of bad PFNs, extract the scope, name and rse_id and fill the bad_replicas table. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Time between two cycles. """ executable = 'minos' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Minos starting') time.sleep( 10 ) # To prevent running on the same partition if all the daemons restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') states_mapping = { BadPFNStatus.BAD: BadFilesStatus.BAD, BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS, BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE } logger(logging.INFO, 'Minos started') chunk_size = 10 # The chunk size used for the commits while not GRACEFUL_STOP.is_set(): start_time = time.time() heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') pfns = [] try: bad_replicas = {} temporary_unvailables = {} pfns = get_bad_pfns(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], limit=bulk) # Class the PFNs into bad_replicas and temporary_unavailable for pfn in pfns: path = pfn['pfn'] account = pfn['account'] reason = pfn['reason'] expires_at = pfn['expires_at'] state = states_mapping[pfn['state']] if state in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]: if (account, reason, state) not in bad_replicas: bad_replicas[(account, reason, state)] = [] bad_replicas[(account, reason, state)].append(path) elif state == BadFilesStatus.TEMPORARY_UNAVAILABLE: if (account, reason, expires_at) not in temporary_unvailables: temporary_unvailables[(account, reason, expires_at)] = [] temporary_unvailables[(account, reason, expires_at)].append(path) # Process the bad and suspicious files # The scope, name, rse_id are extracted and filled into the bad_replicas table for account, reason, state in bad_replicas: vo = account.vo pfns = bad_replicas[(account, reason, state)] logger( logging.INFO, 'Declaring %s replicas with state %s and reason %s' % (len(pfns), str(state), reason)) session = get_session() schemes = {} dict_rse = {} unknown_replicas = [] try: # Splitting the PFNs by schemes for pfn in pfns: scheme = pfn.split(':')[0] if scheme not in schemes: schemes[scheme] = [] schemes[scheme].append(pfn) for scheme in schemes: _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse( schemes[scheme], vo=vo) for rse_id in tmp_dict_rse: if rse_id not in dict_rse: dict_rse[rse_id] = {} if scheme not in dict_rse[rse_id]: dict_rse[rse_id][scheme] = [] dict_rse[rse_id][scheme].extend( tmp_dict_rse[rse_id]) unknown_replicas.extend( tmp_unknown_replicas.get('unknown', [])) # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns if unknown_replicas: logger( logging.INFO, 'The following 
replicas are unknown and will be removed : %s' % str(unknown_replicas)) bulk_delete_bad_pfns(pfns=unknown_replicas, session=None) for rse_id, pfns_by_scheme in dict_rse.items(): for scheme, pfns in pfns_by_scheme.items(): vo_str = '' if vo == 'def' else ' on VO ' + vo logger( logging.DEBUG, 'Running on RSE %s%s with %s replicas' % (get_rse_name(rse_id=rse_id), vo_str, len(pfns))) nchunk = 0 tot_chunk = int(math.ceil(len(pfns) / chunk_size)) for chunk in chunks(pfns, chunk_size): nchunk += 1 logger( logging.DEBUG, 'Running on %s chunk out of %s' % (nchunk, tot_chunk)) unknown_replicas = declare_bad_file_replicas( pfns=chunk, reason=reason, issuer=account, status=state, session=session) if unknown_replicas: logger( logging.DEBUG, 'Unknown replicas : %s' % (str(unknown_replicas))) bulk_delete_bad_pfns(pfns=chunk, session=session) session.commit() # pylint: disable=no-member except (DatabaseException, DatabaseError) as error: if re.match('.*ORA-00054.*', error.args[0]) or re.match( '.*ORA-00060.*', error.args[0] ) or 'ERROR 1205 (HY000)' in error.args[0]: logger( logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error)) else: logger(logging.ERROR, 'Exception', exc_info=True) session.rollback() # pylint: disable=no-member except Exception: session.rollback() # pylint: disable=no-member logger(logging.CRITICAL, 'Exception', exc_info=True) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) # Now get the temporary unavailable and update the replicas states for account, reason, expires_at in temporary_unvailables: vo = account.vo pfns = temporary_unvailables[(account, reason, expires_at)] logger( logging.INFO, 'Declaring %s replicas temporary available with timeout %s and reason %s' % (len(pfns), str(expires_at), reason)) logger(logging.DEBUG, 'Extracting RSEs') schemes = {} dict_rse = {} unknown_replicas = [] # Splitting the PFNs by schemes for pfn in pfns: scheme = pfn.split(':')[0] if scheme not in schemes: schemes[scheme] = [] schemes[scheme].append(pfn) for scheme in schemes: _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse( schemes[scheme], vo=vo) for rse_id in tmp_dict_rse: if rse_id not in dict_rse: dict_rse[rse_id] = [] dict_rse[rse_id].extend(tmp_dict_rse[rse_id]) unknown_replicas.extend( tmp_unknown_replicas.get('unknown', [])) # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns if unknown_replicas: logger( logging.INFO, 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas)) bulk_delete_bad_pfns(pfns=unknown_replicas, session=None) for rse_id in dict_rse: replicas = [] rse = get_rse_name(rse_id=rse_id, session=None) rse_vo_str = rse if vo == 'def' else '{} on {}'.format( rse, vo) logger(logging.DEBUG, 'Running on RSE %s' % rse_vo_str) for rep in get_did_from_pfns(pfns=dict_rse[rse_id], rse_id=None, vo=vo, session=None): for pfn in rep: scope = rep[pfn]['scope'] name = rep[pfn]['name'] replicas.append({ 'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.TEMPORARY_UNAVAILABLE, 'pfn': pfn }) # The following part needs to be atomic # We update the replicas states to TEMPORARY_UNAVAILABLE # then insert a row in the bad_replicas table. 
TODO Update the row if it already exists # then delete the corresponding rows into the bad_pfns table logger( logging.DEBUG, 'Running on %s replicas on RSE %s' % (len(replicas), rse_vo_str)) nchunk = 0 tot_chunk = int( math.ceil(len(replicas) / float(chunk_size))) session = get_session() for chunk in chunks(replicas, chunk_size): try: nchunk += 1 logger( logging.DEBUG, 'Running on %s chunk out of %s' % (nchunk, tot_chunk)) update_replicas_states(chunk, nowait=False, session=session) bulk_add_bad_replicas( chunk, account, state=BadFilesStatus.TEMPORARY_UNAVAILABLE, reason=reason, expires_at=expires_at, session=session) pfns = [entry['pfn'] for entry in chunk] bulk_delete_bad_pfns(pfns=pfns, session=session) session.commit() # pylint: disable=no-member except (UnsupportedOperation, ReplicaNotFound) as error: session.rollback() # pylint: disable=no-member logger( logging.ERROR, 'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s' % str(error)) for rep in chunk: logger(logging.DEBUG, 'Working on %s' % (str(rep))) try: get_metadata(rep['scope'], rep['name']) unavailable_states = [] rep_state = get_replicas_state( rep['scope'], rep['name']) unavailable_states.extend( rep_state.get( ReplicaState.TEMPORARY_UNAVAILABLE, [])) unavailable_states.extend( rep_state.get( ReplicaState.BEING_DELETED, [])) unavailable_states.extend( rep_state.get(ReplicaState.BAD, [])) # If the replica is already not available, it is removed from the bad PFNs table if rep['rse_id'] in unavailable_states: logger( logging.INFO, '%s is in unavailable state. Will be removed from the list of bad PFNs' % str(rep['pfn'])) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) # If the expiration date of the TEMPORARY_UNAVAILABLE is in the past, it is removed from the bad PFNs table elif expires_at < datetime.now(): logger( logging.INFO, 'PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns' % (str(rep['pfn']), expires_at)) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) # Else update everything in the same transaction else: try: session = get_session() update_replicas_states( [rep], nowait=False, session=session) bulk_add_bad_replicas( [rep], account, state=BadFilesStatus. TEMPORARY_UNAVAILABLE, reason=reason, expires_at=expires_at, session=session) bulk_delete_bad_pfns( pfns=[rep['pfn']], session=session) session.commit() # pylint: disable=no-member except Exception: logger( logging.ERROR, 'Cannot update state of %s' % str(rep['pfn'])) except (DataIdentifierNotFound, ReplicaNotFound): logger( logging.ERROR, 'Will remove %s from the list of bad PFNs' % str(rep['pfn'])) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) session = get_session() except (DatabaseException, DatabaseError) as error: if re.match( '.*ORA-00054.*', error.args[0]) or re.match( '.*ORA-00060.*', error.args[0] ) or 'ERROR 1205 (HY000)' in error.args[0]: logger( logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error)) else: logger(logging.ERROR, 'Exception', exc_info=True) session.rollback() # pylint: disable=no-member session = get_session() except Exception: session.rollback() # pylint: disable=no-member logger(logging.CRITICAL, 'Exception', exc_info=True) session = get_session() except Exception as error: logger(logging.ERROR, '%s' % (str(error))) if once: break if len(pfns) == bulk: logger( logging.INFO, 'Processed maximum number of pfns according to the bulk size. 
Restart immediately next cycle') else: daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger) heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done')
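# Hedged sketch (not part of the daemon above): the ORA-00054 / ORA-00060 /
# 'ERROR 1205 (HY000)' checks recur in several except blocks throughout these
# daemons. A small helper like the hypothetical is_db_lock_error() below shows
# what that inline test amounts to; the daemons themselves keep it inline.
import re

def is_db_lock_error(error):
    """True if the DB error text looks like a row-lock or deadlock:
    Oracle ORA-00054/ORA-00060 or MySQL 'ERROR 1205 (HY000)'."""
    msg = str(error.args[0]) if getattr(error, 'args', None) else str(error)
    return bool(re.match('.*ORA-00054.*', msg)
                or re.match('.*ORA-00060.*', msg)
                or 'ERROR 1205 (HY000)' in msg)

# Example usage inside an except (DatabaseException, DatabaseError) block:
#     if is_db_lock_error(error): log a warning and skip, else log the exception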
def necromancer(thread=0, bulk=5, once=False, sleep_time=60): """ Creates a Necromancer Worker that gets a list of bad replicas for a given hash, identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation. :param thread: Thread number at startup. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Thread sleep time after each chunk of work. """ executable = 'necromancer' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) while not GRACEFUL_STOP.is_set(): stime = time.time() heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.DEBUG, 'Starting new cycle') # Check if there is a Judge Evaluator backlog try: max_evaluator_backlog_count = config_get('necromancer', 'max_evaluator_backlog_count') max_evaluator_backlog_count = int(max_evaluator_backlog_count) except (NoOptionError, NoSectionError, RuntimeError, ValueError): max_evaluator_backlog_count = None try: max_evaluator_backlog_duration = config_get('necromancer', 'max_evaluator_backlog_duration') max_evaluator_backlog_duration = int(max_evaluator_backlog_duration) except (NoOptionError, NoSectionError, RuntimeError, ValueError): max_evaluator_backlog_duration = None if max_evaluator_backlog_count or max_evaluator_backlog_duration: evaluator_backlog_count, evaluator_backlog_duration = get_evaluation_backlog(expiration_time=sleep_time) if max_evaluator_backlog_count and \ evaluator_backlog_count and \ max_evaluator_backlog_duration and \ evaluator_backlog_duration and \ evaluator_backlog_count > max_evaluator_backlog_count and \ evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration): logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation') GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_count and evaluator_backlog_count and evaluator_backlog_count > max_evaluator_backlog_count: logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation') GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_duration and evaluator_backlog_duration and evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration): logger(logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation') GRACEFUL_STOP.wait(30) continue # Check how many bad replicas are queued try: max_bad_replicas_backlog_count = config_get('necromancer', 'max_bad_replicas_backlog_count') max_bad_replicas_backlog_count = int(max_bad_replicas_backlog_count) except (NoOptionError, NoSectionError, RuntimeError, ValueError): max_bad_replicas_backlog_count = None bad_replicas_backlog = REGION.get('bad_replicas_backlog') if bad_replicas_backlog is NO_VALUE: bad_replicas_backlog = get_bad_replicas_backlog() REGION.set('bad_replicas_backlog', bad_replicas_backlog) tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog]) list_of_rses = list() # If too many replica, call list_bad_replicas with a list of RSEs if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1: logger(logging.INFO, 'Backlog of bads replica too big. 
Apply some sharing between different RSEs') rses = list() cnt = 0 for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=True): rses.append({'id': key}) cnt += bad_replicas_backlog[key] if cnt >= bulk: list_of_rses.append(rses) rses = list() cnt = 0 else: list_of_rses.append(None) tot_processed = 0 if tot_bad_files == 0: logger(logging.INFO, 'No bad replicas to process.') else: ttime = time.time() replicas = [] try: for rses in list_of_rses: replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses) for replica in replicas: scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse'] logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse)) list_replicas = get_replicas_state(scope=scope, name=name) if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas: logger(logging.INFO, 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name)) try: update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True) monitor.record_counter(name='necromancer.badfiles.lostfile') except (DatabaseException, DatabaseError) as error: if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]: logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error)) else: logger(logging.ERROR, str(error)) else: rep = list_replicas.get(ReplicaState.AVAILABLE, []) unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, []) logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep))) try: update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True) monitor.record_counter(name='necromancer.badfiles.recovering') except (DatabaseException, DatabaseError) as error: if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]: logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error)) else: logger(logging.ERROR, str(error)) tot_processed += len(replicas) logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - ttime), str(len(replicas)))) except Exception: exc_type, exc_value, exc_traceback = exc_info() logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip()) if once: break elif tot_processed == 0 or tot_bad_files == 0: daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger) logger(logging.INFO, 'Graceful stop requested') heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop done')
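# Hedged sketch: the backlog sharding that necromancer() performs inline when
# the number of queued bad replicas exceeds max_bad_replicas_backlog_count.
# shard_rses_by_backlog() is a hypothetical helper name; like the loop above,
# it walks RSEs from the largest backlog down, closes a bucket once it holds
# at least `bulk` bad files, and drops any trailing partial bucket this cycle.
def shard_rses_by_backlog(backlog, bulk):
    """backlog: {rse_id: number_of_bad_files}; returns a list of RSE buckets."""
    buckets, current, cnt = [], [], 0
    for rse_id in sorted(backlog, key=backlog.get, reverse=True):
        current.append({'id': rse_id})
        cnt += backlog[rse_id]
        if cnt >= bulk:
            buckets.append(current)
            current, cnt = [], 0
    return buckets

# Example: shard_rses_by_backlog({'rse_a': 7, 'rse_b': 3, 'rse_c': 2}, bulk=5)
# -> [[{'id': 'rse_a'}], [{'id': 'rse_b'}, {'id': 'rse_c'}]]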
def transmogrifier(bulk=5, once=False, sleep_time=60): """ Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash, identifies the subscriptions matching the DIDs and submit a replication rule for each DID matching a subscription. :param thread: Thread number at startup. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Time between two cycles. """ executable = 'transmogrifier' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) while not graceful_stop.is_set(): heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) dids, subscriptions = [], [] tottime = 0 prepend_str = 'transmogrifier[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') try: # Get the new DIDs based on the is_new flag for did in list_new_dids(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], chunk_size=bulk, did_type=None): dids.append({'scope': did['scope'], 'did_type': str(did['did_type']), 'name': did['name']}) sub_dict = {3: []} # Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest # The priority is defined as 'policyid' for sub in list_subscriptions(None, None): if sub['state'] != SubscriptionState.INACTIVE and sub['lifetime'] and (datetime.now() > sub['lifetime']): update_subscription(name=sub['name'], account=sub['account'], metadata={'state': SubscriptionState.INACTIVE}) elif sub['state'] in [SubscriptionState.ACTIVE, SubscriptionState.UPDATED]: priority = 3 if 'policyid' in sub: if int(sub['policyid']) not in sub_dict: sub_dict[int(sub['policyid'])] = [] priority = int(sub['policyid']) sub_dict[priority].append(sub) priorities = list(sub_dict.keys()) priorities.sort() # Order the subscriptions according to their priority for priority in priorities: subscriptions.extend(sub_dict[priority]) except SubscriptionNotFound as error: logger(logging.WARNING, 'No subscriptions defined: %s' % (str(error))) time.sleep(10) continue except Exception as error: logger(logging.ERROR, 'Failed to get list of new DIDs or subscriptions: %s' % (str(error))) try: results = {} start_time = time.time() blocklisted_rse_id = [rse['id'] for rse in list_rses({'availability_write': False})] logger(logging.DEBUG, 'In transmogrifier worker') identifiers = [] # Loop over all the new dids for did in dids: did_success = True if did['did_type'] == str(DIDType.DATASET) or did['did_type'] == str(DIDType.CONTAINER): did_tag = '%s:%s' % (did['scope'].internal, did['name']) results[did_tag] = [] try: metadata = get_metadata(did['scope'], did['name']) # Loop over all the subscriptions for subscription in subscriptions: # Check if the DID match the subscription if is_matching_subscription(subscription, did, metadata) is True: filter_string = loads(subscription['filter']) split_rule = filter_string.get('split_rule', False) stime = time.time() results[did_tag].append(subscription['id']) logger(logging.INFO, '%s:%s matches subscription %s' % (did['scope'], did['name'], subscription['name'])) rules = loads(subscription['replication_rules']) created_rules = {} cnt = 0 for rule_dict in rules: cnt += 1 created_rules[cnt] = [] # Get all the rule and subscription parameters grouping = rule_dict.get('grouping', 'DATASET') lifetime = rule_dict.get('lifetime', None) ignore_availability = rule_dict.get('ignore_availability', None) 
weight = rule_dict.get('weight', None) source_replica_expression = rule_dict.get('source_replica_expression', None) locked = rule_dict.get('locked', None) if locked == 'True': locked = True else: locked = False purge_replicas = rule_dict.get('purge_replicas', False) if purge_replicas == 'True': purge_replicas = True else: purge_replicas = False rse_expression = str(rule_dict['rse_expression']) comment = str(subscription['comments']) if 'comments' in rule_dict: comment = str(rule_dict['comments']) subscription_id = str(subscription['id']) account = subscription['account'] copies = int(rule_dict['copies']) activity = rule_dict.get('activity', 'User Subscriptions') try: validate_schema(name='activity', obj=activity, vo=account.vo) except InputValidationError as error: logger(logging.ERROR, 'Error validating the activity %s' % (str(error))) activity = 'User Subscriptions' if lifetime: lifetime = int(lifetime) str_activity = "".join(activity.split()) success = False nattempt = 5 attemptnr = 0 skip_rule_creation = False selected_rses = [] chained_idx = rule_dict.get('chained_idx', None) if chained_idx: params = {} if rule_dict.get('associated_site_idx', None): params['associated_site_idx'] = rule_dict.get('associated_site_idx', None) logger(logging.DEBUG, '%s Chained subscription identified. Will use %s', prepend_str, str(created_rules[chained_idx])) algorithm = rule_dict.get('algorithm', None) selected_rses = select_algorithm(algorithm, created_rules[chained_idx], params) else: # In the case of chained subscription, don't use rseselector but use the rses returned by the algorithm if split_rule: preferred_rses = set() for rule in list_rules(filters={'subscription_id': subscription_id, 'scope': did['scope'], 'name': did['name']}): for rse_dict in parse_expression(rule['rse_expression'], filter_={'vo': account.vo}): preferred_rses.add(rse_dict['rse']) preferred_rses = list(preferred_rses) try: selected_rses, preferred_unmatched = resolve_rse_expression(rse_expression, account, weight=weight, copies=copies, size=0, preferred_rses=preferred_rses, blocklist=blocklisted_rse_id) except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error: logger(logging.WARNING, 'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blocklisted sites' % (subscription['name'], account, str(error))) # Now including the blocklisted sites try: selected_rses, preferred_unmatched = resolve_rse_expression(rse_expression, account, weight=weight, copies=copies, size=0, preferred_rses=preferred_rses) ignore_availability = True except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error: logger(logging.ERROR, 'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.' 
% (subscription['name'], account, str(error))) monitor.record_counter(name='transmogrifier.addnewrule.errortype.{exception}', labels={'exception': str(error.__class__.__name__)}) # The DID won't be reevaluated at the next cycle did_success = did_success and True continue if len(preferred_rses) - len(preferred_unmatched) >= copies: skip_rule_creation = True for attempt in range(0, nattempt): attemptnr = attempt nb_rule = 0 # Try to create the rule try: if split_rule: if not skip_rule_creation: for rse in selected_rses: if isinstance(selected_rses, dict): source_replica_expression = selected_rses[rse].get('source_replica_expression', None) weight = selected_rses[rse].get('weight', None) logger(logging.INFO, 'Will insert one rule for %s:%s on %s' % (did['scope'], did['name'], rse)) rule_ids = add_rule(dids=[{'scope': did['scope'], 'name': did['name']}], account=account, copies=1, rse_expression=rse, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id=subscription_id, source_replica_expression=source_replica_expression, activity=activity, purge_replicas=purge_replicas, ignore_availability=ignore_availability, comment=comment) created_rules[cnt].append(rule_ids[0]) nb_rule += 1 if nb_rule == copies: success = True break else: rule_ids = add_rule(dids=[{'scope': did['scope'], 'name': did['name']}], account=account, copies=copies, rse_expression=rse_expression, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id=subscription['id'], source_replica_expression=source_replica_expression, activity=activity, purge_replicas=purge_replicas, ignore_availability=ignore_availability, comment=comment) created_rules[cnt].append(rule_ids[0]) nb_rule += 1 monitor.record_counter(name='transmogrifier.addnewrule.done', delta=nb_rule) monitor.record_counter(name='transmogrifier.addnewrule.activity.{activity}', delta=nb_rule, labels={'activity': str_activity}) success = True break except (InvalidReplicationRule, InvalidRuleWeight, InvalidRSEExpression, StagingAreaRuleRequiresLifetime, DuplicateRule) as error: # Errors that won't be retried success = True logger(logging.ERROR, str(error)) monitor.record_counter(name='transmogrifier.addnewrule.errortype.{exception}', labels={'exception': str(error.__class__.__name__)}) break except (ReplicationRuleCreationTemporaryFailed, InsufficientTargetRSEs, InsufficientAccountLimit, DatabaseException, RSEWriteBlocked) as error: # Errors to be retried logger(logging.ERROR, '%s Will perform an other attempt %i/%i' % (str(error), attempt + 1, nattempt)) monitor.record_counter(name='transmogrifier.addnewrule.errortype.{exception}', labels={'exception': str(error.__class__.__name__)}) except Exception: # Unexpected errors monitor.record_counter(name='transmogrifier.addnewrule.errortype.{exception}', labels={'exception': 'unknown'}) logger(logging.ERROR, "Unexpected error", exc_info=True) did_success = (did_success and success) if (attemptnr + 1) == nattempt and not success: logger(logging.ERROR, 'Rule for %s:%s on %s cannot be inserted' % (did['scope'], did['name'], rse_expression)) else: logger(logging.INFO, '%s rule(s) inserted in %f seconds' % (str(nb_rule), time.time() - stime)) except DataIdentifierNotFound as error: logger(logging.WARNING, str(error)) if did_success: if did['did_type'] == str(DIDType.FILE): monitor.record_counter(name='transmogrifier.did.file.processed') elif did['did_type'] == str(DIDType.DATASET): monitor.record_counter(name='transmogrifier.did.dataset.processed') elif did['did_type'] == 
str(DIDType.CONTAINER): monitor.record_counter(name='transmogrifier.did.container.processed', delta=1) monitor.record_counter(name='transmogrifier.did.processed', delta=1) identifiers.append({'scope': did['scope'], 'name': did['name'], 'did_type': did['did_type']}) time1 = time.time() # Mark the DIDs as processed for identifier in chunks(identifiers, 100): _retrial(set_new_dids, identifier, None) logger(logging.DEBUG, 'Time to set the new flag : %f' % (time.time() - time1)) tottime = time.time() - start_time for sub in subscriptions: update_subscription(name=sub['name'], account=sub['account'], metadata={'last_processed': datetime.now()}) logger(logging.INFO, 'It took %f seconds to process %i DIDs' % (tottime, len(dids))) logger(logging.DEBUG, 'DIDs processed : %s' % (str(dids))) monitor.record_counter(name='transmogrifier.job.done', delta=1) monitor.record_timer(name='transmogrifier.job.duration', time=1000 * tottime) except Exception: logger(logging.ERROR, "Failed to run transmogrifier", exc_info=True) monitor.record_counter(name='transmogrifier.job.error', delta=1) monitor.record_counter(name='transmogrifier.addnewrule.error', delta=1) if once is True: break daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger) heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done')
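# Hedged sketch: the policyid-based ordering of subscriptions used by
# transmogrifier() above, isolated from the state handling. 0 is the highest
# priority, 5 the lowest, and 3 is the default when a subscription carries no
# 'policyid'. order_subscriptions_by_priority() is a hypothetical name, not a
# Rucio API; the daemon additionally filters on ACTIVE/UPDATED state and
# deactivates expired subscriptions before this step.
def order_subscriptions_by_priority(subscriptions, default_priority=3):
    by_priority = {default_priority: []}
    for sub in subscriptions:
        priority = int(sub.get('policyid', default_priority))
        by_priority.setdefault(priority, []).append(sub)
    ordered = []
    for priority in sorted(by_priority):
        ordered.extend(by_priority[priority])
    return ordered

# Example:
# order_subscriptions_by_priority([{'name': 'b'}, {'name': 'a', 'policyid': 0}])
# -> [{'name': 'a', 'policyid': 0}, {'name': 'b'}]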
def OAuthManager(once=False, loop_rate=300, max_rows=100, sleep_time=300): """ Main loop to delete all expired tokens, refresh tokens eligible for refresh and delete all expired OAuth session parameters. It was decided to have only 1 daemon for all 3 of these cleanup activities. :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped. :param loop_rate: obsolete, please use sleep_time instead. The number of seconds the daemon will wait before running next loop of operations. :param max_rows: Max number of DB rows to deal with per operation. :param sleep_time: The number of seconds the daemon will wait before running next loop of operations. :returns: None """ if sleep_time == OAuthManager.__defaults__[ 3] and loop_rate != OAuthManager.__defaults__[1]: sleep_time = loop_rate executable = 'oauth-manager' total_workers = 0 worker_number = 0 sanity_check(executable=executable, hostname=socket.gethostname()) # make an initial heartbeat live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) # wait a moment in case all workers started at the same time GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): start = time.time() # issuing the heartbeat for a second time to make all workers aware of each other heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) total_workers = heartbeat['nr_threads'] worker_number = heartbeat['assign_thread'] + 1 ndeleted = 0 ndeletedreq = 0 nrefreshed = 0 try: # ACCESS TOKEN REFRESH - better to run first (in case some of the refreshed tokens needed deletion after this step) logging.info( 'oauth_manager[%i/%i]: ----- START ----- ACCESS TOKEN REFRESH ----- ', worker_number, total_workers) logging.info( 'oauth_manager[%i/%i]: starting to query tokens for automatic refresh', worker_number, total_workers) nrefreshed = refresh_jwt_tokens(total_workers, worker_number, refreshrate=int(sleep_time), limit=max_rows) logging.info( 'oauth_manager[%i/%i]: successfully refreshed %i tokens', worker_number, total_workers, nrefreshed) logging.info( 'oauth_manager[%i/%i]: ----- END ----- ACCESS TOKEN REFRESH ----- ', worker_number, total_workers) record_counter(counters='oauth_manager.tokens.refreshed', delta=nrefreshed) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) except Exception as err: logging.critical(traceback.format_exc()) record_counter('oauth_manager.exceptions.' 
+ err.__class__.__name__) try: # wait 1 second, as the DB does not store milliseconds and tokens # eligible for deletion after the refresh might not get deleted otherwise GRACEFUL_STOP.wait(1) # EXPIRED TOKEN DELETION logging.info( 'oauth_manager[%i/%i]: ----- START ----- DELETION OF EXPIRED TOKENS ----- ', worker_number, total_workers) logging.info( 'oauth_manager[%i/%i]: starting to delete expired tokens', worker_number, total_workers) ndeleted += delete_expired_tokens(total_workers, worker_number, limit=max_rows) logging.info('oauth_manager[%i/%i]: deleted %i expired tokens', worker_number, total_workers, ndeleted) logging.info( 'oauth_manager[%i/%i]: ----- END ----- DELETION OF EXPIRED TOKENS ----- ', worker_number, total_workers) record_counter(counters='oauth_manager.tokens.deleted', delta=ndeleted) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) except Exception as err: logging.critical(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) try: # DELETING EXPIRED OAUTH SESSION PARAMETERS logging.info( 'oauth_manager[%i/%i]: ----- START ----- DELETION OF EXPIRED OAUTH SESSION REQUESTS ----- ', worker_number, total_workers) logging.info( 'oauth_manager[%i/%i]: starting deletion of expired OAuth session requests', worker_number, total_workers) ndeletedreq += delete_expired_oauthrequests(total_workers, worker_number, limit=max_rows) logging.info( 'oauth_manager[%i/%i]: expired parameters of %i authentication requests were deleted', worker_number, total_workers, ndeletedreq) logging.info( 'oauth_manager[%i/%i]: ----- END ----- DELETION OF EXPIRED OAUTH SESSION REQUESTS ----- ', worker_number, total_workers) record_counter(counters='oauth_manager.oauthreq.deleted', delta=ndeletedreq) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) except Exception as err: logging.critical(traceback.format_exc()) record_counter('oauth_manager.exceptions.' + err.__class__.__name__) tottime = time.time() - start logging.info( 'oauth_manager[%i/%i]: took %f seconds to delete %i tokens, %i session parameters and refreshed %i tokens', worker_number, total_workers, tottime, ndeleted, ndeletedreq, nrefreshed) record_timer(stat='oauth_manager.duration', time=1000 * tottime) if once: break else: daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) logging.info('oauth_manager[%i/%i]: graceful stop done', worker_number, total_workers)
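# Hedged sketch: the deprecated-argument fallback at the top of OAuthManager().
# If the caller left sleep_time at its default but changed the obsolete
# loop_rate, the daemon honours loop_rate. The toy function below (made-up
# name and parameters, same positional layout) mirrors that __defaults__ trick.
def toy_daemon(once=False, loop_rate=300, max_rows=100, sleep_time=300):
    # __defaults__ is (False, 300, 100, 300); index 1 is loop_rate's default,
    # index 3 is sleep_time's default.
    if sleep_time == toy_daemon.__defaults__[3] and loop_rate != toy_daemon.__defaults__[1]:
        sleep_time = loop_rate
    return sleep_time

assert toy_daemon() == 300
assert toy_daemon(loop_rate=60) == 60                  # obsolete argument still honoured
assert toy_daemon(loop_rate=60, sleep_time=30) == 30   # explicit sleep_time wins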
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000, partition_wait_time=10): """ Main loop to update the replicas and rules based on finished requests. """ try: conveyor_config = {item[0]: item[1] for item in items('conveyor')} except ConfigNotFound: logging.log(logging.INFO, 'No configuration found for conveyor') conveyor_config = {} # Get suspicious patterns suspicious_patterns = conveyor_config.get('suspicious_pattern', []) if suspicious_patterns: pattern = str(suspicious_patterns) patterns = pattern.split(",") suspicious_patterns = [re.compile(pat.strip()) for pat in patterns] logging.log(logging.DEBUG, "Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns]) retry_protocol_mismatches = conveyor_config.get('retry_protocol_mismatches', False) logger_prefix = executable = 'conveyor-finisher' if activities: activities.sort() executable += '--activities ' + str(activities) with HeartbeatHandler(executable=executable, logger_prefix=logger_prefix) as heartbeat_handler: logger = heartbeat_handler.logger logger(logging.INFO, 'Finisher starting - db_bulk(%i) bulk (%i)', db_bulk, bulk) if partition_wait_time: graceful_stop.wait(partition_wait_time) while not graceful_stop.is_set(): start_time = time.time() try: heart_beat, logger = heartbeat_handler.live(older_than=3600) if activities is None: activities = [None] for activity in activities: logger(logging.DEBUG, 'Working on activity %s', activity) time1 = time.time() reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT], state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING, RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES, RequestState.MISMATCH_SCHEME], limit=db_bulk, older_than=datetime.datetime.utcnow(), total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], mode_all=True, hash_variable='rule_id') record_timer('daemons.conveyor.finisher.get_next', (time.time() - time1) * 1000) time2 = time.time() if reqs: logger(logging.DEBUG, 'Updating %i requests for activity %s', len(reqs), activity) for chunk in chunks(reqs, bulk): try: time3 = time.time() __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, logger=logger) record_timer('daemons.conveyor.finisher.handle_requests_time', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1)) record_counter('daemons.conveyor.finisher.handle_requests', delta=len(chunk)) except Exception as error: logger(logging.WARNING, '%s', str(error)) if reqs: logger(logging.DEBUG, 'Finish to update %s finished requests for activity %s in %s seconds', len(reqs), activity, time.time() - time2) except (DatabaseException, DatabaseError) as error: if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]: logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error)) else: logger(logging.ERROR, 'Exception', exc_info=True) except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) if once: raise if once: break daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)
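# Hedged sketch: the 'suspicious_pattern' parsing finisher() does against the
# conveyor configuration. compile_suspicious_patterns() is a hypothetical
# helper and the pattern strings in the example are illustrative values only;
# the daemon reads the comma-separated option and compiles each entry in the
# same way.
import re

def compile_suspicious_patterns(raw_value):
    """Turn 'pat1, pat2' into [re.compile('pat1'), re.compile('pat2')]."""
    if not raw_value:
        return []
    return [re.compile(pat.strip()) for pat in str(raw_value).split(',') if pat.strip()]

# Example:
# patterns = compile_suspicious_patterns('.*CHECKSUM MISMATCH.*, .*No such file.*')
# any(pat.match('DESTINATION CHECKSUM MISMATCH for file x') for pat in patterns)  # True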
def rule_injector(once=False, sleep_time=60): """ Main loop to check for asynchronous creation of replication rules """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() paused_rules = {} # {rule_id: datetime} # Make an initial heartbeat so that all judge-inectors have the correct worker number on the next try executable = 'judge-injector' heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60) prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') graceful_stop.wait(1) while not graceful_stop.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60) prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] rules = get_injected_rules(total_workers=heartbeat['nr_threads'], worker_number=heartbeat['assign_thread'], limit=100, blocked_rules=[key for key in paused_rules]) logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules))) if not rules and not once: logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules))) daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger) else: for rule in rules: rule_id = rule[0] logger(logging.INFO, 'Injecting rule %s' % rule_id) if graceful_stop.is_set(): break try: start = time.time() inject_rule(rule_id=rule_id, logger=logger) logger(logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start)) except (DatabaseException, DatabaseError) as e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) record_counter('rule.judge.exceptions.LocksDetected') logger(logging.WARNING, 'Locks detected for %s' % rule_id) elif match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logger(logging.ERROR, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except (RSEWriteBlocked) as e: paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except ReplicationRuleCreationTemporaryFailed as e: paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) logger(logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except RuleNotFound: pass except InsufficientAccountLimit: # A rule with InsufficientAccountLimit on injection hangs there potentially forever # It should be marked as SUSPENDED logger(logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id) update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'}) except 
(DatabaseException, DatabaseError) as e: if match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logger(logging.CRITICAL, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except Exception as e: logger(logging.CRITICAL, 'Exception', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) if once: break die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
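# Hedged sketch: the paused_rules back-off bookkeeping rule_injector() keeps as
# a plain {rule_id: datetime} dict. PausedRules is a hypothetical wrapper that
# names the three operations the loop performs inline: drop expired entries at
# the start of each cycle, pause a rule for a random 60-600 s after a lock or
# temporary failure, and pass the still-blocked ids to get_injected_rules().
from datetime import datetime, timedelta
from random import randint

class PausedRules:
    def __init__(self):
        self._until = {}  # {rule_id: datetime until which the rule is skipped}

    def expire(self, now=None):
        now = now or datetime.utcnow()
        for rule_id in [r for r, until in self._until.items() if now > until]:
            del self._until[rule_id]

    def pause(self, rule_id, min_seconds=60, max_seconds=600):
        self._until[rule_id] = datetime.utcnow() + timedelta(seconds=randint(min_seconds, max_seconds))

    def blocked(self):
        return list(self._until)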
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60): """ Creates a Minos Temporary Unavailable Replicas Expiration Worker that gets the list of expired TU replicas and sets them back to AVAILABLE. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Time between two cycles. """ executable = 'minos-temporary-expiration' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos_temporary_expiration[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Minos Temporary Expiration starting') time.sleep(10) # To prevent running on the same partition if all the daemons restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Minos Temporary Expiration started') chunk_size = 10 # The chunk size used for the commits while not graceful_stop.is_set(): start_time = time.time() heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) try: # Get list of expired TU replicas logger(logging.INFO, 'Getting list of expired replicas') expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], limit=1000) logger(logging.INFO, '%s expired replicas returned', len(expired_replicas)) logger(logging.DEBUG, 'List of expired replicas returned %s', str(expired_replicas)) replicas = [] bad_replicas = [] nchunk = 0 tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size))) session = get_session() for chunk in chunks(expired_replicas, chunk_size): heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) skip_replica_update = [] # Process and update the replicas in chunks for replica in chunk: scope, name, rse_id = replica[0], replica[1], replica[2] states_dictionary = get_replicas_state(scope=scope, name=name, session=session) # Check if the replica is not declared bad # If already declared bad don't update the replica state, but remove from bad_pfns if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]): replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}) else: skip_replica_update.append((scope, name)) # Remove the replicas from bad_replicas table in chunks bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}) try: nchunk += 1 logger(logging.DEBUG, 'Running on %s chunk out of %s', nchunk, tot_chunk) update_replicas_states(replicas, nowait=True, session=session) bulk_delete_bad_replicas(bad_replicas, session=session) session.commit() # pylint: disable=no-member except (ReplicaNotFound, DataIdentifierNotFound) as error: session.rollback() # pylint: disable=no-member logger(logging.WARNING, 'One of the replicas does not exist anymore. Updating and deleting one by one. 
Error : %s', str(error)) for replica in chunk: scope, name, rse_id = replica[0], replica[1], replica[2] logger(logging.DEBUG, 'Working on %s:%s on %s', scope, name, rse_id) try: # First check if the DID exists get_metadata(scope, name) if (scope, name) not in skip_replica_update: update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session) bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session) session.commit() # pylint: disable=no-member except DataIdentifierNotFound: session.rollback() # pylint: disable=no-member logger(logging.WARNING, 'DID %s:%s does not exist anymore.', scope, name) bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session) session.commit() # pylint: disable=no-member except ReplicaNotFound: session.rollback() # pylint: disable=no-member logger(logging.WARNING, 'Replica %s:%s on RSEID %s does not exist anymore.', scope, name, rse_id) bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session) session.commit() # pylint: disable=no-member session = get_session() except (DatabaseException, DatabaseError) as error: if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]: logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error)) else: logger(logging.ERROR, 'Exception', exc_info=True) session.rollback() session = get_session() except Exception: session.rollback() # pylint: disable=no-member logger(logging.CRITICAL, str(traceback.format_exc())) session = get_session() except Exception: logger(logging.CRITICAL, str(traceback.format_exc())) if once: break daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop) heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done')
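# Hedged sketch: the chunks() helper the daemons above iterate with (the real
# one lives in Rucio's common utilities). A minimal slicing version, assuming
# the input supports len() and slicing, which is enough for the lists used here.
def chunks(sequence, size):
    """Yield successive slices of at most `size` items from `sequence`."""
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]

# Example: list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]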
def reaper(rses=[], worker_number=0, total_workers=1, chunk_size=100, once=False, scheme=None, sleep_time=60): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param worker_number: The worker number. :param total_workers: The total number of workers. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param sleep_time: Thread sleep time after each chunk of work. """ logging.info('Starting Light Reaper %s-%s: Will work on RSEs: %s', worker_number, total_workers, ', '.join([rse['rse'] for rse in rses])) pid = os.getpid() thread = threading.current_thread() hostname = socket.gethostname() executable = ' '.join(sys.argv) hash_executable = hashlib.sha256(sys.argv[0] + ''.join([rse['rse'] for rse in rses])).hexdigest() sanity_check(executable=None, hostname=hostname) while not GRACEFUL_STOP.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) prepend_str = 'light-reaper [%i/%i] : ' % ( heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals())) nothing_to_do = True start_time = time.time() random.shuffle(rses) for rse in rses: rse_id = rse['id'] rse = rse['rse'] replicas = list_expired_temporary_dids( rse_id=rse_id, limit=chunk_size, worker_number=worker_number, total_workers=total_workers) rse_info = rsemgr.get_rse_info(rse_id=rse_id) rse_protocol = rse_core.get_rse_protocols(rse_id=rse_id) prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme) deleted_replicas = [] try: prot.connect() for replica in replicas: nothing_to_do = False try: # pfn = str(rsemgr.lfns2pfns(rse_settings=rse_info, # lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}], # operation='delete', scheme=scheme).values()[0]) pfn = 's3://%s%s%s' % (prot.attributes['hostname'], prot.attributes['prefix'], replica['name']) # logging.debug('Light Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, total_workers, replica['scope'], replica['name'], pfn, rse) start = time.time() prot.delete(pfn) duration = time.time() - start logger( logging.INFO, 'Deletion SUCCESS of %s:%s as %s on %s in %s seconds', replica['scope'], replica['name'], pfn, rse, duration) payload = { 'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse, 'rse_id': rse_id, 'file-size': replica.get('bytes') or 0, 'bytes': replica.get('bytes') or 0, 'url': pfn, 'duration': duration, 'protocol': prot.attributes['scheme'] } if replica['scope'].vo != 'def': payload['vo'] = replica['scope'].vo add_message('deletion-done', payload) deleted_replicas.append(replica) except SourceNotFound: err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % ( replica['scope'], replica['name'], pfn, rse) logger(logging.WARNING, err_msg) deleted_replicas.append(replica) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: err_msg = 'Deletion NOACCESS of %s:%s as %s on %s: %s' % ( replica['scope'], replica['name'], pfn, rse, str(error)) logger(logging.WARNING, err_msg) payload = { 'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse, 'rse_id': rse_id, 'file-size': replica['bytes'] or 0, 'bytes': replica['bytes'] or 0, 'url': pfn, 'reason': 
str(error), 'protocol': prot.attributes['scheme'] } if replica['scope'].vo != 'def': payload['vo'] = replica['scope'].vo add_message('deletion-failed', payload) except: logger(logging.CRITICAL, traceback.format_exc()) finally: prot.close() delete_temporary_dids(dids=deleted_replicas) if once: break if once: break if nothing_to_do: logger(logging.INFO, 'Nothing to do. I will sleep for 60s') daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) except DatabaseException as error: logging.warning('Reaper: %s', str(error)) except: logging.critical(traceback.format_exc()) die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) logging.info('Graceful stop requested') logging.info('Graceful stop done') return
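# Hedged sketch: the per-worker executable hash the light reaper keys its
# heartbeat on. hashlib.sha256() needs bytes on Python 3, so the concatenated
# string has to be encoded before hashing -- the dark reaper below already does
# this, and the sha256 call above would need the same .encode().
# executable_hash() is a hypothetical helper name.
import hashlib

def executable_hash(argv0, rse_names):
    """sha256 of the script name plus the RSE names, used with hash_executable."""
    return hashlib.sha256((argv0 + ''.join(rse_names)).encode()).hexdigest()

# Example: executable_hash('rucio-reaper', ['MOCK', 'MOCK2'])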
def reaper(rses, worker_number=0, total_workers=1, chunk_size=100, once=False, scheme=None, sleep_time=60): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. :param worker_number: The worker number. :param total_workers: The total number of workers. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param sleep_time: Thread sleep time after each chunk of work. """ logging.info('Starting Dark Reaper %s-%s: Will work on RSEs: %s', worker_number, total_workers, ', '.join(rses)) pid = os.getpid() thread = threading.current_thread() hostname = socket.gethostname() executable = ' '.join(sys.argv) hash_executable = hashlib.sha256( (sys.argv[0] + ''.join(rses)).encode()).hexdigest() sanity_check(executable=None, hostname=hostname) while not GRACEFUL_STOP.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) logging.info( 'Dark Reaper({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}' .format(locals())) nothing_to_do = True start_time = time.time() rses_to_process = list(set(rses) & set(list_rses())) random.shuffle(rses_to_process) for rse_id in rses_to_process: replicas = list_quarantined_replicas( rse_id=rse_id, limit=chunk_size, worker_number=worker_number, total_workers=total_workers) rse_info = rsemgr.get_rse_info(rse_id=rse_id) rse = rse_info['rse'] prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme) deleted_replicas = [] try: prot.connect() for replica in replicas: nothing_to_do = False scope = '' if replica['scope']: scope = replica['scope'].external try: pfn = str( list( rsemgr.lfns2pfns( rse_settings=rse_info, lfns=[{ 'scope': scope, 'name': replica['name'], 'path': replica['path'] }], operation='delete', scheme=scheme).values())[0]) logging.info( 'Dark Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, total_workers, scope, replica['name'], pfn, rse) start = time.time() prot.delete(pfn) duration = time.time() - start logging.info( 'Dark Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, total_workers, scope, replica['name'], pfn, rse, duration) payload = { 'scope': scope, 'name': replica['name'], 'rse': rse, 'rse_id': rse_id, 'file-size': replica.get('bytes') or 0, 'bytes': replica.get('bytes') or 0, 'url': pfn, 'duration': duration, 'protocol': prot.attributes['scheme'] } if replica['scope'].vo != 'def': payload['vo'] = replica['scope'].vo add_message('deletion-done', payload) deleted_replicas.append(replica) except SourceNotFound: err_msg = ( 'Dark Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, total_workers, scope, replica['name'], pfn, rse)) logging.warning(err_msg) deleted_replicas.append(replica) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: err_msg = ( 'Dark Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, total_workers, scope, replica['name'], pfn, rse, str(error))) logging.warning(err_msg) payload = { 'scope': scope, 'name': replica['name'], 'rse': rse, 'rse_id': rse_id, 'file-size': replica['bytes'] or 0, 'bytes': replica['bytes'] or 0, 'url': pfn, 'reason': str(error), 'protocol': prot.attributes['scheme'] } if replica['scope'].vo != 'def': payload['vo'] = replica['scope'].vo add_message('deletion-failed', payload) except Exception: 
logging.critical(traceback.format_exc()) finally: prot.close() delete_quarantined_replicas(rse_id=rse_id, replicas=deleted_replicas) if once: break if once: break if nothing_to_do: logging.info('Dark Reaper %s-%s: Nothing to do', worker_number, total_workers) daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) except DatabaseException as error: logging.warning('Reaper: %s', str(error)) except Exception: logging.critical(traceback.format_exc()) die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) logging.info('Graceful stop requested') logging.info('Graceful stop done') return
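# Hedged sketch: the 'deletion-done' / 'deletion-failed' payload both reapers
# above assemble inline, factored into a hypothetical deletion_payload() helper.
# Field names mirror the dictionaries built in the loops; 'vo' is only attached
# for non-default VOs, and callers would pass duration=... on success or
# reason=... on failure via **extra.
def deletion_payload(scope, name, rse, rse_id, pfn, nbytes, protocol, vo='def', **extra):
    payload = {
        'scope': scope,
        'name': name,
        'rse': rse,
        'rse_id': rse_id,
        'file-size': nbytes or 0,
        'bytes': nbytes or 0,
        'url': pfn,
        'protocol': protocol,
    }
    if vo != 'def':
        payload['vo'] = vo
    payload.update(extra)
    return payload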
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, rse_expression='MOCK', vos=None, max_replicas_per_rse=100, sleep_time=60): """ Main loop to check for available replicas which are labeled as suspicious Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table and available on other RSE. Finds surls of these replicas and declares them as bad. :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped. :param younger_than: The number of days since which bad_replicas table will be searched for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'), but 'AVAILABLE' on other RSE(s). :param nattempts: The minimum number of appearances in the bad_replica DB table in order to appear in the resulting list of replicas for recovery. :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE. If more is found, processing is skipped and warning is printed. :param sleep_time: Thread sleep time after each chunk of work. :returns: None """ # assembling the worker name identifier ('executable') including the rses from <rse_expression> # in order to have the possibility to detect a start of a second instance with the same set of RSES executable = argv[0] multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) if not multi_vo: if vos: logging.warning( 'Ignoring argument vos, this is only applicable in a multi-VO setup.' ) vos = ['def'] else: if vos: invalid = set(vos) - set([v['vo'] for v in list_vos()]) if invalid: msg = 'VO{} {} cannot be found'.format( 's' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) raise VONotFound(msg) else: vos = [v['vo'] for v in list_vos()] logging.info('replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) # Don't require a result from the expression at each VO, only raise if we can't get a result from any of them rses = [] exceptions_raised = 0 for vo in vos: try: parsed_rses = parse_expression(expression=rse_expression, filter_={'vo': vo}) except InvalidRSEExpression: exceptions_raised += 1 parsed_rses = [] for rse in parsed_rses: rses.append(rse['id']) if exceptions_raised == len(vos): raise InvalidRSEExpression('RSE Expression resulted in an empty set.') rses.sort() executable += ' --rse-expression ' + str(rses) sanity_check(executable=executable, hostname=socket.gethostname()) # make an initial heartbeat - expected only one replica-recoverer thread on one node # heartbeat mechanism is used in this daemon only for information purposes # (due to expected low load, the actual DB query does not filter the result based on worker number) live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) # wait a moment in case all workers started at the same time GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon) heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) total_workers = heartbeat['nr_threads'] worker_number = heartbeat['assign_thread'] 
# there is only 1 worker allowed for this daemon if total_workers != 1: logging.error( 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname()) die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) break start = time.time() logging.info( 'replica_recoverer[%i/%i]: ready to query replicas at RSE %s,' + ' reported suspicious in the last %i days at least %i times which are available on other RSEs.', # NOQA: W503 worker_number, total_workers, rse_expression, younger_than, nattempts) getfileskwargs = { 'younger_than': younger_than, 'nattempts': nattempts, 'exclude_states': ['B', 'R', 'D', 'L', 'T'], 'available_elsewhere': True, 'is_suspicious': True } # Don't require a result from the expression at each VO, only raise if we can't get a result from any of them recoverable_replicas = [] exceptions_raised = 0 for vo in vos: try: recoverable_replicas.extend( get_suspicious_files(rse_expression, filter_={'vo': vo}, **getfileskwargs)) except InvalidRSEExpression: exceptions_raised += 1 if exceptions_raised == len(vos): raise InvalidRSEExpression( 'RSE Expression resulted in an empty set.') logging.info( 'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.', worker_number, total_workers, time.time() - start, len(recoverable_replicas)) if not recoverable_replicas and not once: logging.info( 'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas.', worker_number, total_workers, len(recoverable_replicas)) daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) else: logging.info( 'replica_recoverer[%i/%i]: looking for replica surls.', worker_number, total_workers) start = time.time() surls_to_recover = { } # dictionary of { vo1: {rse1: [surl1, surl2, ... ], rse2: ...}, vo2:... } cnt_surl_not_found = 0 for replica in recoverable_replicas: scope = replica['scope'] name = replica['name'] vo = scope.vo rse = replica['rse'] rse_id = replica['rse_id'] if GRACEFUL_STOP.is_set(): break if vo not in surls_to_recover: surls_to_recover[vo] = {} if rse_id not in surls_to_recover[vo]: surls_to_recover[vo][rse_id] = [] # for each suspicious replica, we get its surl through the list_replicas function surl_not_found = True for rep in list_replicas([{'scope': scope, 'name': name}]): for site in rep['rses']: if site == rse_id: surls_to_recover[vo][rse_id].append( rep['rses'][site][0]) surl_not_found = False if surl_not_found: cnt_surl_not_found += 1 logging.warning( 'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.', worker_number, total_workers, name, rse) logging.info( 'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.', worker_number, total_workers, len(recoverable_replicas) - cnt_surl_not_found, len(recoverable_replicas), time.time() - start) for vo in surls_to_recover: for rse_id in surls_to_recover[vo]: logging.info( 'replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on %s: %s.', worker_number, total_workers, len(surls_to_recover[vo][rse_id]), rse, str(surls_to_recover[vo][rse_id])) if len(surls_to_recover[vo] [rse_id]) > max_replicas_per_rse: logging.warning( 'replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on %s. 
Please investigate.', worker_number, total_workers, max_replicas_per_rse, str(len(surls_to_recover[vo][rse_id])), rse) else: declare_bad_file_replicas( pfns=surls_to_recover[vo][rse_id], reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), status=BadFilesStatus.BAD, session=None) logging.info( 'replica_recoverer[%i/%i]: finished declaring bad replicas on %s.', worker_number, total_workers, rse) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) except Exception as err: logging.critical(traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) if once: break die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number, total_workers)
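# Hedged sketch: the {vo: {rse_id: [surl, ...]}} structure the replica
# recoverer above builds before calling declare_bad_file_replicas() per RSE.
# group_surls_by_vo_and_rse() is a hypothetical helper working on already
# resolved (vo, rse_id, surl) tuples; the daemon resolves the surls itself via
# list_replicas().
def group_surls_by_vo_and_rse(resolved_replicas):
    """resolved_replicas: iterable of (vo, rse_id, surl) tuples."""
    grouped = {}
    for vo, rse_id, surl in resolved_replicas:
        grouped.setdefault(vo, {}).setdefault(rse_id, []).append(surl)
    return grouped

# Example:
# group_surls_by_vo_and_rse([('def', 'rse1', 'root://a/f1'), ('def', 'rse1', 'root://a/f2')])
# -> {'def': {'rse1': ['root://a/f1', 'root://a/f2']}}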
def deliver_messages(once=False, brokers_resolved=None, thread=0, bulk=1000, delay=10, broker_timeout=3, broker_retry=3, sleep_time=10): ''' Main loop to deliver messages to a broker. ''' logging.info('[broker] starting - threads (%i) bulk (%i)', thread, bulk) if sleep_time == deliver_messages.__defaults__[ 7] and delay != deliver_messages.__defaults__[4]: sleep_time = delay if not brokers_resolved: logging.fatal('No brokers resolved.') return if not broker_timeout: # Allow zero in config broker_timeout = None logging.info('[broker] checking authentication method') use_ssl = True try: use_ssl = config_get_bool('messaging-hermes', 'use_ssl') except: logging.info( '[broker] could not find use_ssl in configuration -- please update your rucio.cfg' ) port = config_get_int('messaging-hermes', 'port') vhost = config_get('messaging-hermes', 'broker_virtual_host', raise_exception=False) if not use_ssl: username = config_get('messaging-hermes', 'username') password = config_get('messaging-hermes', 'password') port = config_get_int('messaging-hermes', 'nonssl_port') conns = [] for broker in brokers_resolved: if not use_ssl: logging.info( '[broker] setting up username/password authentication: %s' % broker) con = stomp.Connection12(host_and_ports=[(broker, port)], vhost=vhost, keepalive=True, timeout=broker_timeout) else: logging.info( '[broker] setting up ssl cert/key authentication: %s' % broker) con = stomp.Connection12( host_and_ports=[(broker, port)], use_ssl=True, ssl_key_file=config_get('messaging-hermes', 'ssl_key_file'), ssl_cert_file=config_get('messaging-hermes', 'ssl_cert_file'), vhost=vhost, keepalive=True, timeout=broker_timeout) con.set_listener( 'rucio-hermes', HermesListener(con.transport._Transport__host_and_ports[0])) conns.append(con) destination = config_get('messaging-hermes', 'destination') executable = 'hermes [broker]' hostname = socket.getfqdn() pid = os.getpid() heartbeat_thread = threading.current_thread() # Make an initial heartbeat so that all daemons have the correct worker number on the next try sanity_check(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread) GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: t_start = time.time() heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread) logging.debug('[broker] %i:%i - using: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], [ conn.transport._Transport__host_and_ports[0][0] for conn in conns ]) messages = retrieve_messages(bulk=bulk, thread=heartbeat['assign_thread'], total_threads=heartbeat['nr_threads']) if messages: logging.debug('[broker] %i:%i - retrieved %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(messages)) to_delete = [] for message in messages: try: conn = random.sample(conns, 1)[0] if not conn.is_connected(): host_and_ports = conn.transport._Transport__host_and_ports[ 0][0] record_counter('daemons.hermes.reconnect.%s' % host_and_ports.split('.')[0]) labels = {'host': host_and_ports.split('.')[0]} RECONNECT_COUNTER.labels(**labels).inc() if not use_ssl: logging.info( '[broker] %i:%i - connecting with USERPASS to %s', heartbeat['assign_thread'], heartbeat['nr_threads'], host_and_ports) conn.connect(username, password, wait=True) else: logging.info( '[broker] %i:%i - connecting with SSL to %s', heartbeat['assign_thread'], heartbeat['nr_threads'], host_and_ports) conn.connect(wait=True) conn.send(body=json.dumps({ 'event_type': str(message['event_type']).lower(), 'payload': message['payload'], 'created_at': 
str(message['created_at']) }), destination=destination, headers={ 'persistent': 'true', 'event_type': str(message['event_type']).lower() }) to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': json.dumps(message['payload']), 'event_type': message['event_type'] }) except ValueError: logging.warning('Cannot serialize payload to JSON: %s', str(message['payload'])) to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': str(message['payload']), 'event_type': message['event_type'] }) continue except stomp.exception.NotConnectedException as error: logging.warning( 'Could not deliver message due to NotConnectedException: %s', str(error)) continue except stomp.exception.ConnectFailedException as error: logging.warning( 'Could not deliver message due to ConnectFailedException: %s', str(error)) continue except Exception as error: logging.warning('Could not deliver message: %s', str(error)) logging.critical(traceback.format_exc()) continue if str(message['event_type']).lower().startswith( 'transfer') or str(message['event_type']).lower( ).startswith('stagein'): logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, request-id: %s, transfer-id: %s, created_at: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload'].get('scope', None), message['payload'].get('name', None), message['payload'].get('dst-rse', None), message['payload'].get('request-id', None), message['payload'].get('transfer-id', None), str(message['created_at'])) elif str(message['event_type']).lower().startswith( 'dataset'): logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, rule-id: %s, created_at: %s)', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload']['scope'], message['payload']['name'], message['payload']['rse'], message['payload']['rule_id'], str(message['created_at'])) elif str(message['event_type']).lower().startswith( 'deletion'): if 'url' not in message['payload']: message['payload']['url'] = 'unknown' logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, url: %s, created_at: %s)', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload']['scope'], message['payload']['name'], message['payload']['rse'], message['payload']['url'], str(message['created_at'])) else: logging.debug('[broker] %i:%i - other message: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], message) delete_messages(to_delete) logging.info('[broker] %i:%i - submitted %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(to_delete)) if once: break except NoResultFound: # silence this error: https://its.cern.ch/jira/browse/RUCIO-1699 pass except: logging.critical(traceback.format_exc()) daemon_sleep(start_time=t_start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) for conn in conns: try: conn.disconnect() except Exception: pass logging.debug('[broker] %i:%i - graceful stop requested', heartbeat['assign_thread'], heartbeat['nr_threads']) die(executable, hostname, pid, heartbeat_thread) logging.debug('[broker] %i:%i - graceful stop done', heartbeat['assign_thread'], heartbeat['nr_threads'])
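# --- Illustrative sketch: how the STOMP frame sent by deliver_messages is assembled
# from a Rucio message row. Only the JSON body and headers are built here; the
# conn.send() call, reconnection and bookkeeping stay in the daemon above. The helper
# name is an assumption for illustration.
import json

def build_broker_frame(message):
    """Return (body, headers) matching what deliver_messages sends to the broker."""
    event_type = str(message['event_type']).lower()
    body = json.dumps({'event_type': event_type,
                       'payload': message['payload'],
                       'created_at': str(message['created_at'])})
    headers = {'persistent': 'true', 'event_type': event_type}
    return body, headers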
def hermes2(once=False, thread=0, bulk=1000, sleep_time=10): """ Creates a Hermes2 Worker that can submit messages to different services (InfluxDB, ElasticSearch, ActiveMQ). The list of services needs to be defined in the config service, in the hermes section. The list of endpoints needs to be defined in rucio.cfg, in the hermes section. :param once: Run only once. :param thread: Thread number at startup. :param bulk: The number of requests to process. :param sleep_time: Time between two cycles. """ executable = 'hermes2' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname, pid=pid, thread=hb_thread) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) # Make an initial heartbeat so that all daemons have the correct worker number on the next try GRACEFUL_STOP.wait(10) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'hermes2[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') try: services_list = get('hermes', 'services_list') services_list = services_list.split(',') except ConfigNotFound: logger(logging.DEBUG, 'No services found, exiting') sys.exit(1) if 'influx' in services_list: try: influx_endpoint = config_get('hermes', 'influxdb_endpoint', False, None) if not influx_endpoint: logger( logging.ERROR, 'InfluxDB defined in the services list, but no endpoint can be found. Exiting' ) sys.exit(1) except Exception as err: logger(logging.ERROR, str(err)) if 'elastic' in services_list: try: elastic_endpoint = config_get('hermes', 'elastic_endpoint', False, None) if not elastic_endpoint: logger( logging.ERROR, 'Elastic defined in the services list, but no endpoint can be found. 
Exiting' ) sys.exit(1) except Exception as err: logger(logging.ERROR, str(err)) if 'activemq' in services_list: try: # activemq_endpoint = config_get('hermes', 'activemq_endpoint', False, None) conns, destination, username, password, use_ssl = setup_activemq( logger) if not conns: logger( logging.ERROR, 'ActiveMQ defined in the services list, cannot be set up') sys.exit(1) except Exception as err: logger(logging.ERROR, str(err)) while not GRACEFUL_STOP.is_set(): message_status = copy.deepcopy(services_list) message_statuses = {} stime = time.time() try: start_time = time.time() heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'hermes2[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') messages = retrieve_messages( bulk=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads']) if messages: for message in messages: message_statuses[message['id']] = copy.deepcopy( services_list) logger(logging.DEBUG, 'Retrieved %i messages in %s seconds', len(messages), time.time() - start_time) if 'influx' in message_status: t_time = time.time() logger(logging.DEBUG, 'Will submit to influxDB') try: state = aggregate_to_influx(messages=messages, bin_size='1m', endpoint=influx_endpoint, logger=logger) except Exception as error: logger(logging.ERROR, 'Error sending to InfluxDB : %s', str(error)) state = 500 if state in [204, 200]: logger( logging.INFO, 'Messages successfully submitted to influxDB in %s seconds', time.time() - t_time) for message in messages: message_statuses[message['id']].remove('influx') else: logger(logging.INFO, 'Failure to submit to influxDB') if 'elastic' in message_status: t_time = time.time() try: state = submit_to_elastic(messages=messages, endpoint=elastic_endpoint, logger=logger) except Exception as error: logger(logging.ERROR, 'Error sending to Elastic : %s', str(error)) state = 500 if state in [200, 204]: logger( logging.INFO, 'Messages successfully submitted to elastic in %s seconds', time.time() - t_time) for message in messages: message_statuses[message['id']].remove('elastic') else: logger(logging.INFO, 'Failure to submit to elastic') if 'emails' in message_status: t_time = time.time() try: to_delete = deliver_emails(messages=messages, logger=logger) logger( logging.INFO, 'Messages successfully submitted by emails in %s seconds', time.time() - t_time) for message_id in to_delete: message_statuses[message_id].remove('emails') except Exception as error: logger(logging.ERROR, 'Error sending email : %s', str(error)) if 'activemq' in message_status: t_time = time.time() try: to_delete = deliver_to_activemq( messages=messages, conns=conns, destination=destination, username=username, password=password, use_ssl=use_ssl, logger=logger) logger( logging.INFO, 'Messages successfully submitted to ActiveMQ in %s seconds', time.time() - t_time) for message_id in to_delete: message_statuses[message_id].remove('activemq') except Exception as error: logger(logging.ERROR, 'Error sending to ActiveMQ : %s', str(error)) to_delete = [] to_update = {} for message in messages: status = message_statuses[message['id']] if not status: to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': str(message['payload']), 'event_type': message['event_type'] }) else: status = ",".join(status) if status not in to_update: to_update[status] = [] to_update[status].append({ 'id': message['id'], 
'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': str(message['payload']), 'event_type': message['event_type'] }) logger(logging.INFO, 'Deleting %s messages', len(to_delete)) delete_messages(messages=to_delete) for status in to_update: logger( logging.INFO, 'Failure to submit %s messages to %s. Will update the message status', str(len(to_update[status])), status) update_messages_services(messages=to_update[status], services=status) if once: break daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger) except Exception: logger(logging.ERROR, "Failed to submit messages", exc_info=True)
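# --- Illustrative sketch of the per-message bookkeeping used by hermes2: every message
# starts with the full services list, each successfully delivered service is removed,
# and whatever remains decides whether the message is deleted or re-queued for the
# failed services only. The function name and the shape of delivery_results
# ({service: set of delivered message ids}) are assumptions for illustration.
def split_messages_by_remaining_services(messages, services_list, delivery_results):
    to_delete, to_update = [], {}
    for message in messages:
        remaining = [srv for srv in services_list
                     if message['id'] not in delivery_results.get(srv, set())]
        row = {'id': message['id'], 'created_at': message['created_at'],
               'updated_at': message['created_at'], 'payload': str(message['payload']),
               'event_type': message['event_type']}
        if not remaining:
            to_delete.append(row)
        else:
            to_update.setdefault(','.join(remaining), []).append(row)
    return to_delete, to_update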
def deliver_emails(once=False, send_email=True, thread=0, bulk=1000, delay=10, sleep_time=10): ''' Main loop to deliver emails via SMTP. ''' logging.info('[email] starting - threads (%i) bulk (%i)', thread, bulk) if sleep_time == deliver_emails.__defaults__[ 5] and delay != deliver_emails.__defaults__[4]: sleep_time = delay executable = 'hermes [email]' hostname = socket.getfqdn() pid = os.getpid() heartbeat_thread = threading.current_thread() sanity_check(executable=executable, hostname=hostname) # Make an initial heartbeat so that all daemons have the correct worker number on the next try live(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread) GRACEFUL_STOP.wait(1) email_from = config_get('messaging-hermes', 'email_from') while not GRACEFUL_STOP.is_set(): heartbeat = live(executable, hostname, pid, heartbeat_thread) logging.debug('[email] %i:%i - bulk %i', heartbeat['assign_thread'], heartbeat['nr_threads'], bulk) t_start = time.time() messages = retrieve_messages(bulk=bulk, thread=heartbeat['assign_thread'], total_threads=heartbeat['nr_threads'], event_type='email') if messages != []: to_delete = [] for message in messages: logging.debug('[email] %i:%i - submitting: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message)) if PY2: msg = MIMEText(message['payload']['body'].encode('utf-8')) else: msg = MIMEText(message['payload']['body']) msg['From'] = email_from msg['To'] = ', '.join(message['payload']['to']) if PY2: msg['Subject'] = message['payload']['subject'].encode( 'utf-8') else: msg['Subject'] = message['payload']['subject'] if send_email: smtp = smtplib.SMTP() smtp.connect() smtp.sendmail(msg['From'], message['payload']['to'], msg.as_string()) smtp.quit() to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': str(message['payload']), 'event_type': 'email' }) logging.debug('[email] %i:%i - submitting done: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['id'])) delete_messages(to_delete) logging.info('[email] %i:%i - submitted %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(to_delete)) if once: break daemon_sleep(start_time=t_start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) logging.debug('[email] %i:%i - graceful stop requested', heartbeat['assign_thread'], heartbeat['nr_threads']) die(executable, hostname, pid, heartbeat_thread) logging.debug('[email] %i:%i - graceful stop done', heartbeat['assign_thread'], heartbeat['nr_threads'])
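# --- Illustrative sketch: composing and sending one notification the same way
# deliver_emails does, isolated for testing. The smtp_host parameter and dry_run flag
# are assumptions; the daemon above relies on smtplib defaults via smtp.connect().
import smtplib
from email.mime.text import MIMEText

def send_notification(payload, email_from, smtp_host='localhost', dry_run=True):
    """payload is the message payload dict with 'body', 'to' and 'subject' keys."""
    msg = MIMEText(payload['body'])
    msg['From'] = email_from
    msg['To'] = ', '.join(payload['to'])
    msg['Subject'] = payload['subject']
    if not dry_run:
        smtp = smtplib.SMTP(smtp_host)
        smtp.sendmail(msg['From'], payload['to'], msg.as_string())
        smtp.quit()
    return msg.as_string()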
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, scope='tests', once=False, dataset_lifetime=None, set_metadata=False): sleep(sleep_time * (total_workers - worker_number) / total_workers) executable = 'automatix' hostname = socket.getfqdn() pid = getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) prefix = 'automatix[%i/%i] : ' % (worker_number, total_workers) logger = formatted_logger(logging.log, prefix + '%s') while not GRACEFUL_STOP.is_set(): heartbeat.live(executable, hostname, pid, hb_thread) starttime = time() prefix = 'automatix[%i/%i] : ' % (worker_number, total_workers) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Getting data distribution') probabilities, data = get_data_distribution(inputfile) logger(logging.DEBUG, 'Probabilities %s', probabilities) totretries = 3 status = False for site in sites: for retry in range(0, totretries): start_time = time() tmpdir = tempfile.mkdtemp() logger(logging.INFO, 'Running on site %s', site) dic = choose_element(probabilities, data) metadata = dic['metadata'] try: nbfiles = dic['nbfiles'] except KeyError: nbfiles = 2 logger(logging.WARNING, 'No nbfiles defined in the configuration, will use 2') try: filesize = dic['filesize'] except KeyError: filesize = 1000000 logger(logging.WARNING, 'No filesize defined in the configuration, will use 1M files') dsn = generate_didname(metadata, None, 'dataset') fnames = [] lfns = [] physical_fnames = [] for _ in range(nbfiles): fname = generate_didname(metadata=metadata, dsn=dsn, did_type='file') lfns.append(fname) logger(logging.INFO, 'Generating file %s in dataset %s', fname, dsn) physical_fname = '%s/%s' % (tmpdir, "".join(fname.split('/'))) physical_fnames.append(physical_fname) generate_file(physical_fname, filesize, logger=logger) fnames.append(fname) logger(logging.INFO, 'Upload %s to %s', dsn, site) dsn = '%s:%s' % (scope, dsn) status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, dataset_lifetime=dataset_lifetime, did=dsn, set_metadata=set_metadata, logger=logger) for physical_fname in physical_fnames: remove(physical_fname) rmdir(tmpdir) if status: monitor.record_counter(name='automatix.addnewdataset.done', delta=1) monitor.record_counter(name='automatix.addnewfile.done', delta=nbfiles) monitor.record_timer('automatix.datasetinjection', (time() - start_time) * 1000) break else: logger(logging.INFO, 'Failed to upload files. Will retry another time (attempt %s/%s)', str(retry + 1), str(totretries)) if once is True: logger(logging.INFO, 'Run with once mode. Exiting') break tottime = time() - starttime if status: logger(logging.INFO, 'It took %s seconds to upload one dataset on %s', str(tottime), str(sites)) daemon_sleep(start_time=starttime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger) else: logger(logging.INFO, 'Retrying a new upload') heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done')
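# --- Illustrative sketch of the probability-driven selection automatix relies on:
# given cumulative probabilities per workload type and the matching data entries,
# pick one entry at random. This is a simplified stand-in for the intent of
# get_data_distribution/choose_element, not the actual implementation.
import random

def pick_workload(probabilities, data):
    """probabilities: {key: cumulative probability}; data: {key: workload dict}."""
    draw = random.uniform(0, max(probabilities.values()))
    for key in sorted(probabilities, key=probabilities.get):
        if draw <= probabilities[key]:
            return data[key]
    return data[max(probabilities, key=probabilities.get)]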
def actions_loop(once, scope, rses, sleep_time, dark_min_age, dark_threshold_percent, miss_threshold_percent, force_proceed, scanner_files_path): """ Main loop to apply the CC actions """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() executable = 'storage-consistency-actions' heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread) # Make an initial heartbeat # so that all storage-consistency-actions have the correct worker number on the next try prefix = 'storage-consistency-actions[%i/%i] ' %\ (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger( logging.INFO, 'hostname: %s pid: %d current_thread: %s' % (hostname, pid, current_thread)) graceful_stop.wait(1) while not graceful_stop.is_set(): try: heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread) logger(logging.INFO, 'heartbeat? %s' % heartbeat) prefix = 'storage-consistency-actions[%i/%i] ' %\ (heartbeat['assign_thread'], heartbeat['nr_threads']) logger(logging.INFO, 'prefix: %s' % prefix) start = time.time() logger(logging.DEBUG, 'Start time: %f' % start) deckard_loop(scope, rses, dark_min_age, dark_threshold_percent, miss_threshold_percent, force_proceed, scanner_files_path) daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger) except Exception as e: traceback.print_exc() logger(logging.WARNING, '\n Something went wrong here... %s' % e) logger( logging.WARNING, '\n Something went wrong here... %s ' % (e.__class__.__name__)) if once: break die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
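# --- Illustrative sketch of the prefixed-logger pattern repeated in these daemons:
# build a '<daemon>[assign_thread/nr_threads] ' prefix from the heartbeat dict and
# wrap logging.log with it. make_prefixed_logger is an assumed stand-in shown only to
# make the pattern explicit; the daemons above use Rucio's formatted_logger.
import logging

def make_prefixed_logger(daemon_name, heart_beat):
    prefix = '%s[%i/%i] ' % (daemon_name, heart_beat['assign_thread'], heart_beat['nr_threads'])

    def _log(level, msg, *args, **kwargs):
        logging.log(level, prefix + msg, *args, **kwargs)

    return _log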
def reaper(rses, include_rses, exclude_rses, vos=None, chunk_size=100, once=False, greedy=False, scheme=None, delay_seconds=0, sleep_time=60, auto_exclude_threshold=100, auto_exclude_timeout=600): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param include_rses: RSE expression to include RSEs. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", or the current VO otherwise. :param chunk_size: The size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param delay_seconds: The delay to query replicas in BEING_DELETED state. :param sleep_time: Time between two cycles. :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded. :param auto_exclude_timeout: Timeout for temporarily excluded RSEs. """ hostname = socket.getfqdn() executable = 'reaper' pid = os.getpid() hb_thread = threading.current_thread() sanity_check(executable=executable, hostname=hostname) heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Reaper starting') if not once: GRACEFUL_STOP.wait( 10 ) # To prevent running on the same partition if all the reapers restart at the same time heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Reaper started') while not GRACEFUL_STOP.is_set(): # try to get auto exclude parameters from the config table. Otherwise use CLI parameters. 
try: auto_exclude_threshold = config_get('reaper', 'auto_exclude_threshold', default=auto_exclude_threshold) auto_exclude_timeout = config_get('reaper', 'auto_exclude_timeout', default=auto_exclude_timeout) except (NoOptionError, NoSectionError, RuntimeError): pass # Check if there is a Judge Evaluator backlog try: max_evaluator_backlog_count = config_get( 'reaper', 'max_evaluator_backlog_count') except (NoOptionError, NoSectionError, RuntimeError): max_evaluator_backlog_count = None try: max_evaluator_backlog_duration = config_get( 'reaper', 'max_evaluator_backlog_duration') except (NoOptionError, NoSectionError, RuntimeError): max_evaluator_backlog_duration = None if max_evaluator_backlog_count or max_evaluator_backlog_duration: backlog = get_evaluation_backlog() if max_evaluator_backlog_count and \ backlog[0] and \ max_evaluator_backlog_duration and \ backlog[1] and \ backlog[0] > max_evaluator_backlog_count and \ backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration): logger( logging.ERROR, 'Reaper: Judge evaluator backlog count and duration hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_count and backlog[ 0] and backlog[0] > max_evaluator_backlog_count: logger( logging.ERROR, 'Reaper: Judge evaluator backlog count hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_duration and backlog[ 1] and backlog[1] < datetime.utcnow() - timedelta( minutes=max_evaluator_backlog_duration): logger( logging.ERROR, 'Reaper: Judge evaluator backlog duration hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses, vos) if not rses_to_process: logger(logging.ERROR, 'Reaper: No RSEs found. Will sleep for 30 seconds') GRACEFUL_STOP.wait(30) continue start_time = time.time() try: staging_areas = [] dict_rses = {} heart_beat = live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') tot_needed_free_space = 0 for rse in rses_to_process: # Check if the RSE is a staging area if rse['staging_area']: staging_areas.append(rse['rse']) # Check if RSE is blocklisted if rse['availability'] % 2 == 0: logger(logging.DEBUG, 'RSE %s is blocklisted for delete', rse['rse']) continue needed_free_space, only_delete_obsolete = __check_rse_usage( rse['rse'], rse['id'], greedy=greedy, logger=logger) if needed_free_space: dict_rses[(rse['rse'], rse['id'])] = [ needed_free_space, only_delete_obsolete ] tot_needed_free_space += needed_free_space elif only_delete_obsolete: dict_rses[(rse['rse'], rse['id'])] = [ needed_free_space, only_delete_obsolete ] else: logger(logging.DEBUG, 'Nothing to delete on %s', rse['rse']) # Ordering the RSEs based on the needed free space sorted_dict_rses = OrderedDict( sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True)) logger(logging.DEBUG, 'List of RSEs to process ordered by needed space desc: %s', str(sorted_dict_rses)) # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostname, rse_info) as value rses_hostname_mapping = get_rses_to_hostname_mapping() # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping)) list_rses_mult = [] # Loop over the RSEs. 
rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity for rse_key in dict_rses: rse_name, rse_id = rse_key # The length of the deletion queue scales inversely with the number of workers # The ceil increases the weight of the RSEs with a small amount of files to delete if tot_needed_free_space: max_workers = ceil(dict_rses[rse_key][0] / tot_needed_free_space * 1000 / heart_beat['nr_threads']) else: max_workers = 1 list_rses_mult.extend([ (rse_name, rse_id, dict_rses[rse_key][0], dict_rses[rse_key][1]) for _ in range(int(max_workers)) ]) random.shuffle(list_rses_mult) paused_rses = [] for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult: result = REGION.get('pause_deletion_%s' % rse_id, expiration_time=120) if result is not NO_VALUE: paused_rses.append(rse_name) logger( logging.DEBUG, 'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while', rse_name) continue result = REGION.get('temporary_exclude_%s' % rse_id, expiration_time=auto_exclude_timeout) if result is not NO_VALUE: logger( logging.WARNING, 'Too many failed attempts for %s in last cycle. RSE is temporarily excluded.', rse_name) labels = {'rse': rse_name} EXCLUDED_RSE_GAUGE.labels(**labels).set(1) continue labels = {'rse': rse_name} EXCLUDED_RSE_GAUGE.labels(**labels).set(0) percent = 0 if tot_needed_free_space: percent = needed_free_space / tot_needed_free_space * 100 logger( logging.DEBUG, 'Working on %s. Percentage of the total space needed %.2f', rse_name, percent) try: rse_hostname, rse_info = rses_hostname_mapping[rse_id] except KeyError: logger(logging.DEBUG, "Hostname lookup for %s failed.", rse_name) REGION.set('pause_deletion_%s' % rse_id, True) continue rse_hostname_key = '%s,%s' % (rse_id, rse_hostname) payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None) # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt)) tot_threads_for_hostname = 0 tot_threads_for_rse = 0 for key in payload_cnt: if key and key.find(',') > -1: if key.split(',')[1] == rse_hostname: tot_threads_for_hostname += payload_cnt[key] if key.split(',')[0] == str(rse_id): tot_threads_for_rse += payload_cnt[key] max_deletion_thread = get_max_deletion_threads_by_hostname( rse_hostname) if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread: logger( logging.DEBUG, 'Too many deletion threads for %s on RSE %s. Back off', rse_hostname, rse_name) # Might need to reschedule a try on this RSE later in the same cycle continue logger( logging.INFO, 'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s', rse_hostname, tot_threads_for_hostname, max_deletion_thread, rse_name) live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None) logger(logging.DEBUG, 'Total deletion workers for %s : %i', rse_hostname, tot_threads_for_hostname + 1) # List and mark BEING_DELETED the files to delete del_start_time = time.time() only_delete_obsolete = dict_rses[(rse_name, rse_id)][1] try: with monitor.record_timer_block( 'reaper.list_unlocked_replicas'): if only_delete_obsolete: logger( logging.DEBUG, 'Will run list_and_mark_unlocked_replicas on %s. 
No space needed, will only delete EPOCH tombstoned replicas', rse_name) replicas = list_and_mark_unlocked_replicas( limit=chunk_size, bytes_=needed_free_space, rse_id=rse_id, delay_seconds=delay_seconds, only_delete_obsolete=only_delete_obsolete, session=None) logger( logging.DEBUG, 'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', rse_name, needed_free_space, time.time() - del_start_time, len(replicas)) if len(replicas) < chunk_size: logger( logging.DEBUG, 'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle', rse_name, chunk_size, len(replicas)) REGION.set('pause_deletion_%s' % rse_id, True) except (DatabaseException, IntegrityError, DatabaseError) as error: logger(logging.ERROR, '%s', str(error)) continue except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) # Physical deletion will take place here try: prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme, logger=logger) for file_replicas in chunks(replicas, chunk_size): # Refresh heartbeat live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None) del_start_time = time.time() for replica in file_replicas: try: replica['pfn'] = str( list( rsemgr.lfns2pfns( rse_settings=rse_info, lfns=[{ 'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path'] }], operation='delete', scheme=scheme).values())[0]) except (ReplicaUnAvailable, ReplicaNotFound) as error: logger( logging.WARNING, 'Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s', replica['scope'], replica['name'], rse_name, str(error)) replica['pfn'] = None except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) deleted_files = delete_from_storage( file_replicas, prot, rse_info, staging_areas, auto_exclude_threshold, logger=logger) logger(logging.INFO, '%i files processed in %s seconds', len(file_replicas), time.time() - del_start_time) # Then finally delete the replicas del_start = time.time() with monitor.record_timer_block( 'reaper.delete_replicas'): delete_replicas(rse_id=rse_id, files=deleted_files) logger( logging.DEBUG, 'delete_replicas succeeded on %s : %s replicas in %s seconds', rse_name, len(deleted_files), time.time() - del_start) DELETION_COUNTER.inc(len(deleted_files)) except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) if paused_rses: logger(logging.INFO, 'Deletion paused for a while for the following RSEs: %s', ', '.join(paused_rses)) if once: break daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger) except DatabaseException as error: logger(logging.WARNING, 'Reaper: %s', str(error)) except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) finally: if once: break die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done') return
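# --- Illustrative sketch of how the reaper spreads workers over RSEs: each RSE is
# repeated in the work list proportionally to its share of the total space to free,
# so RSEs needing more deletion get more passes per cycle. The input mirrors the
# dict_rses built above ({(rse_name, rse_id): [needed_free_space, only_delete_obsolete]});
# the helper name is an assumption for illustration.
import random
from math import ceil

def build_weighted_rse_list(dict_rses, nr_threads):
    tot_needed = sum(entry[0] for entry in dict_rses.values() if entry[0])
    weighted = []
    for (rse_name, rse_id), (needed, only_obsolete) in dict_rses.items():
        if tot_needed:
            multiplicity = ceil((needed or 0) / tot_needed * 1000 / nr_threads)
        else:
            multiplicity = 1
        weighted.extend([(rse_name, rse_id, needed, only_obsolete)] * int(multiplicity))
    random.shuffle(weighted)
    return weighted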
def necromancer(thread=0, bulk=5, once=False, sleep_time=60): """ Creates a Necromancer Worker that gets a list of bad replicas for a given hash, identifies lost DIDs and, for non-lost ones, sets the locks and rules for reevaluation. :param thread: Thread number at startup. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Thread sleep time after each chunk of work. """ update_history_threshold = 3600 update_history_time = time.time() executable = 'necromancer' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) while not GRACEFUL_STOP.is_set(): heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') # Check if there is a Judge Evaluator backlog try: max_evaluator_backlog_count = config_get( 'necromancer', 'max_evaluator_backlog_count') max_evaluator_backlog_count = int(max_evaluator_backlog_count) except (NoOptionError, NoSectionError, RuntimeError, ValueError): max_evaluator_backlog_count = None try: max_evaluator_backlog_duration = config_get( 'necromancer', 'max_evaluator_backlog_duration') max_evaluator_backlog_duration = int( max_evaluator_backlog_duration) except (NoOptionError, NoSectionError, RuntimeError, ValueError): max_evaluator_backlog_duration = None if max_evaluator_backlog_count or max_evaluator_backlog_duration: backlog = get_evaluation_backlog(expiration_time=sleep_time) if max_evaluator_backlog_count and \ backlog[0] and \ max_evaluator_backlog_duration and \ backlog[1] and \ backlog[0] > max_evaluator_backlog_count and \ backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration): logger( logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_count and backlog[ 0] and backlog[0] > max_evaluator_backlog_count: logger( logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_duration and backlog[ 1] and backlog[1] < datetime.utcnow() - timedelta( minutes=max_evaluator_backlog_duration): logger( logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue # Check how many bad replicas are queued try: max_bad_replicas_backlog_count = config_get( 'necromancer', 'max_bad_replicas_backlog_count') max_bad_replicas_backlog_count = int( max_bad_replicas_backlog_count) except (NoOptionError, NoSectionError, RuntimeError, ValueError): max_bad_replicas_backlog_count = None bad_replicas_backlog = get_bad_replicas_backlog() tot_bad_files = sum( [bad_replicas_backlog[key] for key in bad_replicas_backlog]) list_of_rses = list() # If too many replicas, call list_bad_replicas with a list of RSEs if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len( bad_replicas_backlog) > 1: logger( logging.INFO, 'Backlog of bad replicas too big. 
Apply some sharing between different RSEs' ) rses = list() cnt = 0 for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=False): rses.append({'id': key}) cnt += bad_replicas_backlog[key] if cnt >= bulk: list_of_rses.append(rses) rses = list() cnt = 0 else: list_of_rses.append(None) stime = time.time() replicas = [] try: for rses in list_of_rses: replicas = list_bad_replicas( limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses) for replica in replicas: scope, name, rse_id, rse = replica['scope'], replica[ 'name'], replica['rse_id'], replica['rse'] logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse)) list_replicas = get_replicas_state(scope=scope, name=name) if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas: logger( logging.INFO, 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name)) try: update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True) monitor.record_counter( name='necromancer.badfiles.lostfile') except DatabaseException as error: logger(logging.WARNING, str(error)) else: rep = list_replicas.get(ReplicaState.AVAILABLE, []) unavailable_rep = list_replicas.get( ReplicaState.TEMPORARY_UNAVAILABLE, []) logger( logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep))) try: update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True) monitor.record_counter( name='necromancer.badfiles.recovering') except DatabaseException as error: logger(logging.WARNING, str(error)) logger( logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas)))) except Exception: exc_type, exc_value, exc_traceback = exc_info() logger( logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip()) if once: break else: now = time.time() if (now - update_history_time) > update_history_threshold: logger( logging.INFO, 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time)) bad_replicas = list_bad_replicas_history( limit=1000000, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads']) for rse_id in bad_replicas: chunk_size = 1000 nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size)) logger(logging.DEBUG, 'Update history for rse_id %s' % (rse_id)) cnt = 0 for chunk in chunks(bad_replicas[rse_id], chunk_size): logger( logging.DEBUG, ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk)) cnt += 1 update_bad_replicas_history(chunk, rse_id) logger( logging.INFO, 'History table updated in %s seconds' % (time.time() - now)) update_history_time = time.time() if len(replicas) == bulk: logger( logging.INFO, 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle' ) else: daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) logger(logging.INFO, 'Graceful stop requested') heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop done')
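# --- Illustrative sketch of the chunked history update at the end of the necromancer
# loop: bad replicas are processed per RSE in fixed-size chunks so one very large RSE
# cannot block the whole update. apply_update stands in for update_bad_replicas_history
# and _chunks is a local helper; both names are assumptions for illustration.
import logging
from math import ceil

def _chunks(items, size):
    for i in range(0, len(items), size):
        yield items[i:i + size]

def update_history_in_chunks(bad_replicas, apply_update, chunk_size=1000):
    for rse_id, replicas in bad_replicas.items():
        nchunk = ceil(len(replicas) / chunk_size)
        for cnt, chunk in enumerate(_chunks(replicas, chunk_size), start=1):
            logging.debug('History for rse_id %s : chunk %i/%i', rse_id, cnt, nchunk)
            apply_update(chunk, rse_id)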