def rule_repairer(once=False, sleep_time=60):
    """
    Main loop to check for STUCK replication rules

    Each iteration refreshes the heartbeat, asks get_stuck_rules for a batch of
    rules assigned to this worker and calls repair_rule on each of them. Rules
    whose repair fails with an ORA-00054 database error are paused for a random
    600-2400 seconds and handed to get_stuck_rules as ``blocked_rules`` until
    the pause expires.

    :param once:       If True, run a single iteration and leave the loop;
                       ``delta=-1`` is then passed to get_stuck_rules instead of 1800.
    :param sleep_time: Value handed to daemon_sleep when an iteration finds no work.
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-repairers have the correct worker number on the next try
    executable = 'judge-repairer'
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)
            worker_number = heartbeat['assign_thread']
            total_workers = heartbeat['nr_threads']

            start = time.time()

            # Refresh paused rules: collect the expired ids first so the dict
            # is never modified while it is being iterated.
            now = datetime.utcnow()
            expired = [key for key, paused_until in paused_rules.items() if now > paused_until]
            for key in expired:
                del paused_rules[key]

            # Select a bunch of rules for this worker to repair
            rules = get_stuck_rules(total_workers=total_workers,
                                    worker_number=worker_number,
                                    delta=-1 if once else 1800,
                                    limit=100,
                                    blocked_rules=list(paused_rules))
            logging.debug('rule_repairer[%s/%s] index query time %f fetch size is %d' % (worker_number, total_workers, time.time() - start, len(rules)))

            if not rules and not once:
                logging.debug('rule_repairer[%s/%s] did not get any work (paused_rules=%s)' % (worker_number, total_workers, str(len(paused_rules))))
                daemon_sleep(start, sleep_time, graceful_stop)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    # Check for shutdown before announcing the repair, so the log
                    # never claims a rule is being repaired when it is skipped.
                    if graceful_stop.is_set():
                        break
                    logging.info('rule_repairer[%s/%s]: Repairing rule %s' % (worker_number, total_workers, rule_id))
                    try:
                        start = time.time()
                        repair_rule(rule_id=rule_id)
                        logging.debug('rule_repairer[%s/%s]: repairing of %s took %f' % (worker_number, total_workers, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        # An exception without args must not raise IndexError here.
                        message = str(e.args[0]) if e.args else str(e)
                        if match('.*ORA-00054.*', message):
                            # Pause this rule for a random period before retrying it.
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                            logging.warning('rule_repairer[%s/%s]: Locks detected for %s' % (worker_number, total_workers, rule_id))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        elif match('.*QueuePool.*', message) or match('.*ORA-03135.*', message):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)

        except (DatabaseException, DatabaseError) as e:
            # An IndexError raised here would escape the loop and skip die(),
            # so guard the args access.
            message = str(e.args[0]) if e.args else str(e)
            if match('.*QueuePool.*', message) or match('.*ORA-03135.*', message):
                logging.warning(traceback.format_exc())
            else:
                logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def rule_repairer(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check for STUCK replication rules

    Each iteration asks get_stuck_rules for the rules assigned to this worker,
    drops the ones that are currently paused and calls repair_rule on the rest.
    Rules whose repair fails with an ORA-00054 database error are paused for a
    random 60-600 seconds.

    :param once:                If True, return after a single iteration;
                                ``delta=-1`` is then passed to get_stuck_rules instead of 600.
    :param process:             Index of this process; combined with ``thread`` into the worker number.
    :param total_processes:     Number of processes; combined with ``threads_per_process``
                                into the ``total_workers`` value given to get_stuck_rules.
    :param thread:              Index of this thread within its process.
    :param threads_per_process: Number of threads per process.
    """

    logging.info('rule_repairer: starting')

    logging.info('rule_repairer: started')

    paused_rules = {}  # {rule_id: datetime}

    # These depend only on the arguments, so compute them once.
    worker_number = process * threads_per_process + thread
    total_workers = total_processes * threads_per_process - 1
    gauge_name = 'rule.judge.repairer.threads.%d' % worker_number

    while not graceful_stop.is_set():
        try:
            # Select a bunch of rules for this worker to repair
            start = time.time()
            rules = get_stuck_rules(total_workers=total_workers,
                                    worker_number=worker_number,
                                    delta=-1 if once else 600)
            logging.debug('rule_repairer index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            # Refresh paused rules: collect the expired ids first so the dict
            # is never modified while it is being iterated.
            now = datetime.utcnow()
            expired = [key for key, paused_until in paused_rules.items() if now > paused_until]
            for key in expired:
                del paused_rules[key]

            # Remove paused rules from result set
            rules = [rule for rule in rules if rule[0] not in paused_rules]

            if not rules and not once:
                logging.info('rule_repairer[%s/%s] did not get any work' % (worker_number, total_workers))
                time.sleep(10)
            else:
                record_gauge(gauge_name, 1)
                for rule in rules:
                    rule_id = rule[0]
                    # Check for shutdown before announcing the repair, so the log
                    # never claims a rule is being repaired when it is skipped.
                    if graceful_stop.is_set():
                        break
                    logging.info('rule_repairer[%s/%s]: Repairing rule %s' % (worker_number, total_workers, rule_id))
                    try:
                        start = time.time()
                        repair_rule(rule_id=rule_id)
                        logging.debug('rule_repairer[%s/%s]: repairing of %s took %f' % (worker_number, total_workers, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        # An exception without args must not raise IndexError here
                        # and abort the rest of the batch.
                        message = str(e.args[0]) if e.args else str(e)
                        if match('.*ORA-00054.*', message):
                            # Pause this rule for a random period before retrying it.
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            logging.warning('rule_repairer[%s/%s]: Locks detected for %s' % (worker_number, total_workers, rule_id))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                record_gauge(gauge_name, 0)
        except Exception as e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge(gauge_name, 0)
            logging.critical(traceback.format_exc())
        if once:
            return
        else:
            time.sleep(30)