def rule_injector(once=False, sleep_time=60): """ Main loop to check for asynchronous creation of replication rules """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() paused_rules = {} # {rule_id: datetime} # Make an initial heartbeat so that all judge-inectors have the correct worker number on the next try executable = 'judge-injector' heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60) prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') graceful_stop.wait(1) while not graceful_stop.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60) prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] rules = get_injected_rules(total_workers=heartbeat['nr_threads'], worker_number=heartbeat['assign_thread'], limit=100, blocked_rules=[key for key in paused_rules]) logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules))) if not rules and not once: logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules))) daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger) else: for rule in rules: rule_id = rule[0] logger(logging.INFO, 'Injecting rule %s' % rule_id) if graceful_stop.is_set(): break try: start = time.time() inject_rule(rule_id=rule_id, logger=logger) logger(logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start)) except (DatabaseException, DatabaseError) as e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) record_counter('rule.judge.exceptions.LocksDetected') logger(logging.WARNING, 'Locks detected for %s' % rule_id) elif match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logger(logging.ERROR, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except (RSEWriteBlocked) as e: paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except ReplicationRuleCreationTemporaryFailed as e: paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) logger(logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except RuleNotFound: pass except InsufficientAccountLimit: # A rule with InsufficientAccountLimit on injection hangs there potentially forever # It should be marked as SUSPENDED logger(logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id) update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'}) except (DatabaseException, DatabaseError) as e: if match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logger(logging.CRITICAL, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except Exception as e: logger(logging.CRITICAL, 'Exception', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) if once: break die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def rule_injector(once=False): """ Main loop to check for asynchronous creation of replication rules """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() paused_rules = {} # {rule_id: datetime} # Make an initial heartbeat so that all judge-inectors have the correct worker number on the next try live(executable='rucio-judge-injector', hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60) graceful_stop.wait(1) while not graceful_stop.is_set(): try: # heartbeat heartbeat = live(executable='rucio-judge-injector', hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60) start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] rules = get_injected_rules( total_workers=heartbeat['nr_threads'] - 1, worker_number=heartbeat['assign_thread'], limit=100, blacklisted_rules=[key for key in paused_rules]) logging.debug( 'rule_injector[%s/%s] index query time %f fetch size is %d' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, time.time() - start, len(rules))) if not rules and not once: logging.debug( 'rule_injector[%s/%s] did not get any work (paused_rules=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, str(len(paused_rules)))) graceful_stop.wait(60) else: for rule in rules: rule_id = rule[0] logging.info('rule_injector[%s/%s]: Injecting rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id)) if graceful_stop.is_set(): break try: start = time.time() inject_rule(rule_id=rule_id) logging.debug( 'rule_injector[%s/%s]: injection of %s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id, time.time() - start)) except (DatabaseException, DatabaseError), e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow( ) + timedelta(seconds=randint(60, 600)) record_counter( 'rule.judge.exceptions.LocksDetected') logging.warning( 'rule_injector[%s/%s]: Locks detected for %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id)) elif match('.*QueuePool.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logging.error(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except RSEBlacklisted, e: paused_rules[rule_id] = datetime.utcnow() + timedelta( seconds=randint(60, 600)) logging.warning( 'rule_injector[%s/%s]: RSEBlacklisted for rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id)) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except ReplicationRuleCreationTemporaryFailed, e: paused_rules[rule_id] = datetime.utcnow() + timedelta( seconds=randint(60, 600)) logging.warning( 'rule_injector[%s/%s]: ReplicationRuleCreationTemporaryFailed for rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id)) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
def run_once(paused_rules, heartbeat_handler, **_kwargs): worker_number, total_workers, logger = heartbeat_handler.live() try: start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] rules = get_injected_rules(total_workers=total_workers, worker_number=worker_number, limit=100, blocked_rules=[key for key in paused_rules]) logger( logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules))) if not rules: logger( logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules))) return for rule in rules: _, _, logger = heartbeat_handler.live() rule_id = rule[0] logger(logging.INFO, 'Injecting rule %s' % rule_id) if graceful_stop.is_set(): break try: start = time.time() inject_rule(rule_id=rule_id, logger=logger) logger( logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start)) except (DatabaseException, DatabaseError) as e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow() + timedelta( seconds=randint(60, 600)) record_counter('rule.judge.exceptions.{exception}', labels={'exception': 'LocksDetected'}) logger(logging.WARNING, 'Locks detected for %s' % rule_id) elif match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) else: logger(logging.ERROR, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) except (RSEWriteBlocked) as e: paused_rules[rule_id] = datetime.utcnow() + timedelta( seconds=randint(60, 600)) logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) except ReplicationRuleCreationTemporaryFailed as e: paused_rules[rule_id] = datetime.utcnow() + timedelta( seconds=randint(60, 600)) logger( logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) except RuleNotFound: pass except InsufficientAccountLimit: # A rule with InsufficientAccountLimit on injection hangs there potentially forever # It should be marked as SUSPENDED logger( logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id) update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'}) except (DatabaseException, DatabaseError) as e: if match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) else: logger(logging.CRITICAL, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) except Exception as e: logger(logging.CRITICAL, 'Exception', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})