def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400,
            once=True, unlock=False, spread_period=0, purge_replicas=False):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at
    expired and delete them.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param date_check: Only rules with an eol_at older than this date are processed.
    :param dry_run: If True, only report what would be done; no rule is updated.
    :param grace_period: The grace_period for the rules (seconds of lifetime set
                         on expired rules when not in dry-run mode).
    :param once: Run only once.
    :param unlock: If True, unlock locked rules before setting their lifetime.
    :param spread_period: Upper bound (seconds) of a random offset added to the
                          grace period, to spread deletions over time.
    :param purge_replicas: If True, also set purge_replicas on the updated rules.
    """
    sleep_time = 60
    executable = 'atropos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    # Register once, wait for the other worker threads to register as well,
    # then refresh so assign_thread/nr_threads are stable.
    time.sleep(10)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
    logging.debug(prepend_str + 'Starting worker')

    summary = {}
    # Per-thread RNG so the spread offsets are deterministic per worker slot.
    rand = random.Random(hb['assign_thread'])

    # Map 'scope:name' -> latest approved expiration date among all exceptions.
    lifetime_exceptions = {}
    for excep in rucio.core.lifetime_exception.list_exceptions(
            exception_id=None,
            states=[LifetimeExceptionsState.APPROVED, ],
            session=None):
        key = '{}:{}'.format(excep['scope'].internal, excep['name'])
        if key not in lifetime_exceptions or lifetime_exceptions[key] < excep['expires_at']:
            lifetime_exceptions[key] = excep['expires_at']
    logging.debug(prepend_str + '%s active exceptions' % len(lifetime_exceptions))

    if not dry_run and date_check > now:
        logging.error(prepend_str + 'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not GRACEFUL_STOP.is_set():
            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'], session=None)
                logging.info(prepend_str + '%s rules to process' % (len(rules)))
                for rule_idx, rule in enumerate(rules, start=1):
                    did = '%s:%s' % (rule.scope, rule.name)
                    did_key = '{}:{}'.format(rule.scope.internal, rule.name)
                    logging.debug(prepend_str + 'Working on rule %s on DID %s on %s' % (rule.id, did, rule.rse_expression))
                    if (rule_idx % 1000) == 0:
                        logging.info(prepend_str + '%s/%s rules processed' % (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    try:
                        rses = parse_expression(rule.rse_expression, filter={'vo': rule.account.vo})
                    except InvalidRSEExpression:
                        logging.warning(prepend_str + 'Rule %s has an RSE expression that results in an empty set: %s' % (rule.id, rule.rse_expression))
                        continue
                    eol_at = rucio.core.lifetime_exception.define_eol(rule.scope, rule.name, rses)
                    if eol_at != rule.eol_at:
                        logging.warning(prepend_str + 'The computed eol %s differs from the one recorded %s for rule %s on %s at %s' % (eol_at, rule.eol_at,
                                                                                                                                        rule.id, did, rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                            continue

                    # Check the exceptions
                    if did_key in lifetime_exceptions:
                        if eol_at > lifetime_exceptions[did_key]:
                            logging.info(prepend_str + 'Rule %s on DID %s on %s has longer expiration date than the one requested : %s' % (rule.id, did,
                                                                                                                                          rule.rse_expression,
                                                                                                                                          lifetime_exceptions[did_key]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logging.info(prepend_str + 'Updating rule %s on DID %s on %s according to the exception till %s' % (rule.id, did,
                                                                                                                               rule.rse_expression,
                                                                                                                               lifetime_exceptions[did_key]))
                            eol_at = lifetime_exceptions[did_key]
                            try:
                                update_rule(rule.id, options={'eol_at': lifetime_exceptions[did_key]})
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue

                    # Now check that the new eol_at is expired
                    if eol_at and eol_at < date_check:
                        no_locks = True
                        for lock in get_dataset_locks(rule.scope, rule.name):
                            # rule.id replaces the former positional rule[4]
                            # (id column) for clarity and robustness.
                            if lock['rule_id'] == rule.id:
                                no_locks = False
                                if lock['rse_id'] not in summary:
                                    summary[lock['rse_id']] = {}
                                if did_key not in summary[lock['rse_id']]:
                                    summary[lock['rse_id']][did_key] = {'length': lock['length'] or 0,
                                                                       'bytes': lock['bytes'] or 0}
                        if no_locks:
                            logging.warning(prepend_str + 'Cannot find a lock for rule %s on DID %s' % (rule.id, did))
                        if not dry_run:
                            # Randomize the lifetime inside the spread window so
                            # expirations do not all fire at the same instant.
                            lifetime = grace_period + rand.randrange(spread_period + 1)
                            logging.info(prepend_str + 'Setting %s seconds lifetime for rule %s' % (lifetime, rule.id))
                            options = {'lifetime': lifetime}
                            if purge_replicas:
                                options['purge_replicas'] = True
                            if rule.locked and unlock:
                                logging.info(prepend_str + 'Unlocking rule %s', rule.id)
                                options['locked'] = False
                            try:
                                update_rule(rule.id, options=options)
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logging.critical(''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

            # Report the accumulated deletion summary per RSE.
            for rse_id in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse_id]:
                    tot_datasets += 1
                    tot_files += summary[rse_id][did].get('length', 0)
                    tot_size += summary[rse_id][did].get('bytes', 0)
                vo = get_rse_vo(rse_id=rse_id)
                logging.info(prepend_str + 'For RSE %s %s %s datasets will be deleted representing %s files and %s bytes' % (get_rse_name(rse_id=rse_id),
                                                                                                                            '' if vo == 'def' else 'on VO ' + vo,
                                                                                                                            tot_datasets, tot_files, tot_size))
            if once:
                break
            else:
                tottime = time.time() - stime
                if tottime < sleep_time:
                    logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                    time.sleep(sleep_time - tottime)
                continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400, once=True):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at
    expired and delete them.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param date_check: Only rules with an eol_at older than this date are processed.
    :param dry_run: If True, only report what would be done; no rule lifetime is set.
    :param grace_period: The grace_period for the rules (seconds of lifetime set
                         on expired rules when not in dry-run mode).
    :param once: Run only once.
    """
    sleep_time = 60
    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    summary = {}
    # Map of DID name -> requested expiration extension.
    lifetime_exceptions = get_lifetime_exceptions()
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])

    if not dry_run and date_check > now:
        logging.error(prepend_str + 'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not graceful_stop.is_set():
            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])
            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'] - 1)
                logging.info(prepend_str + '%s rules to process' % (len(rules)))
                # enumerate replaces the former manual rule_idx counter.
                for rule_idx, rule in enumerate(rules, start=1):
                    logging.debug(prepend_str + 'Working on rule %s on DID %s:%s on %s' % (rule.id, rule.scope, rule.name, rule.rse_expression))
                    if (rule_idx % 1000) == 0:
                        logging.info(prepend_str + '%s/%s rules processed' % (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    rses = parse_expression(rule.rse_expression)
                    eol_at = define_eol(rule.scope, rule.name, rses)

                    # Check the exceptions
                    if rule.name in lifetime_exceptions:
                        if rule.eol_at > lifetime_exceptions[rule.name]:
                            logging.info(prepend_str + 'Rule %s on DID %s:%s on %s expired. Extension requested till %s' % (rule.id, rule.scope, rule.name,
                                                                                                                           rule.rse_expression,
                                                                                                                           lifetime_exceptions[rule.name]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logging.info(prepend_str + 'Updating rule %s on DID %s:%s on %s according to the exception till %s' % (rule.id, rule.scope, rule.name,
                                                                                                                                  rule.rse_expression,
                                                                                                                                  lifetime_exceptions[rule.name]))
                            try:
                                update_rule(rule.id, options={'eol_at': lifetime_exceptions[rule.name]})
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s:%s' % (rule.id, rule.scope, rule.name))
                    elif eol_at != rule.eol_at:
                        logging.warning(prepend_str + 'The computed eol %s differs from the one recorded %s for rule %s on %s:%s at %s' % (eol_at, rule.eol_at,
                                                                                                                                          rule.id, rule.scope,
                                                                                                                                          rule.name,
                                                                                                                                          rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logging.warning(prepend_str + 'Cannot find rule %s on DID %s:%s' % (rule.id, rule.scope, rule.name))

                    no_locks = True
                    for lock in get_dataset_locks(rule.scope, rule.name):
                        # rule.id replaces the former positional rule[4]
                        # (id column) for clarity and robustness.
                        if lock['rule_id'] == rule.id:
                            no_locks = False
                            if lock['rse'] not in summary:
                                summary[lock['rse']] = {}
                            if '%s:%s' % (rule.scope, rule.name) not in summary[lock['rse']]:
                                summary[lock['rse']]['%s:%s' % (rule.scope, rule.name)] = {'length': lock['length'] or 0,
                                                                                           'bytes': lock['bytes'] or 0}
                    if no_locks:
                        logging.warning(prepend_str + 'Cannot find a lock for rule %s on DID %s:%s' % (rule.id, rule.scope, rule.name))
                    if not dry_run:
                        logging.info(prepend_str + 'Setting %s seconds lifetime for rule %s' % (grace_period, rule.id))
                        try:
                            update_rule(rule.id, options={'lifetime': grace_period})
                        except RuleNotFound:
                            logging.warning(prepend_str + 'Cannot find rule %s on DID %s:%s' % (rule.id, rule.scope, rule.name))
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logging.critical(''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

            # Report the accumulated deletion summary per RSE.
            for rse in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse]:
                    tot_datasets += 1
                    tot_files += summary[rse][did].get('length', 0)
                    tot_size += summary[rse][did].get('bytes', 0)
                logging.info(prepend_str + 'For RSE %s %s datasets will be deleted representing %s files and %s bytes' % (rse, tot_datasets,
                                                                                                                         tot_files, tot_size))
            if once:
                break
            else:
                tottime = time.time() - stime
                if tottime < sleep_time:
                    logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                    time.sleep(sleep_time - tottime)
                continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')