def run_once(paused_dids, did_limit, heartbeat_handler, **_kwargs):
    """
    Perform one work cycle of the judge-evaluator: fetch a batch of updated
    dids for this worker and re-evaluate each one.

    :param paused_dids:       Mutable dict shared across invocations, mapping
                              (scope.internal, name) -> datetime until which the
                              did is skipped (set below after lock collisions).
    :param did_limit:         Maximum number of updated dids to fetch per cycle.
    :param heartbeat_handler: Provides live() -> (worker_number, total_workers, logger);
                              called per did to keep the heartbeat fresh.
    :param _kwargs:           Ignored; accepted for a uniform run_once interface.
    """
    worker_number, total_workers, logger = heartbeat_handler.live()
    try:
        # heartbeat
        start = time.time()  # NOQA

        # Refresh paused dids: drop entries whose pause deadline has passed.
        # Iterate over a shallow copy because we delete from paused_dids in
        # place (callers rely on the same dict object being mutated).
        iter_paused_dids = copy.copy(paused_dids)
        for key in iter_paused_dids:
            if datetime.utcnow() > paused_dids[key]:
                del paused_dids[key]

        # Select a bunch of dids for re-evaluation for this worker.
        # Paused dids are passed as blocked so the query excludes them.
        dids = get_updated_dids(total_workers=total_workers,
                                worker_number=worker_number,
                                limit=did_limit,
                                blocked_dids=[(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids])
        logger(logging.DEBUG, 'index query time %f fetch size is %d (%d blocked)',
               time.time() - start,
               len(dids),
               len([(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids]))

        # If the list is empty, send the worker to sleep
        if not dids:
            logger(logging.DEBUG, 'did not get any work (paused_dids=%s)', str(len(paused_dids)))
            return

        done_dids = {}  # did_tag -> list of rule_evaluation_actions already handled this cycle
        for did in dids:
            # Refresh the heartbeat (and logger) on every did; a batch can
            # take long enough for the heartbeat to go stale otherwise.
            _, _, logger = heartbeat_handler.live()
            if graceful_stop.is_set():
                break

            # Check if this did has already been operated on in this batch
            # (the updated_dids table can contain duplicates).
            did_tag = '%s:%s' % (did.scope.internal, did.name)
            if did_tag in done_dids:
                if did.rule_evaluation_action in done_dids[did_tag]:
                    logger(logging.DEBUG, 'evaluation of %s:%s already done', did.scope, did.name)
                    delete_updated_did(id_=did.id)
                    continue
            else:
                done_dids[did_tag] = []

            # Skip paused dids (paused after an earlier lock collision)
            if (did.scope.internal, did.name) in paused_dids:
                continue

            try:
                start_time = time.time()
                re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action)
                logger(logging.DEBUG, 'evaluation of %s:%s took %f', did.scope, did.name, time.time() - start_time)
                delete_updated_did(id_=did.id)
                # Mark done only on success so a failed action can be retried
                # if the same did appears again later in the batch.
                done_dids[did_tag].append(did.rule_evaluation_action)
            except DataIdentifierNotFound:
                # The did vanished in the meantime; nothing left to evaluate.
                delete_updated_did(id_=did.id)
            except (DatabaseException, DatabaseError) as e:
                # ORA-00001 (unique constraint) / ORA-00054 (row locked):
                # another worker is touching this did — pause it for a
                # randomized 1-10 minutes and retry later.
                if match('.*ORA-000(01|54).*', str(e.args[0])):
                    paused_dids[(did.scope.internal, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                    logger(logging.WARNING, 'Locks detected for %s:%s', did.scope, did.name)
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': 'LocksDetected'})
                elif match('.*QueuePool.*', str(e.args[0])):
                    # Connection-pool exhaustion: transient, log as warning.
                    logger(logging.WARNING, traceback.format_exc())
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                elif match('.*ORA-03135.*', str(e.args[0])):
                    # ORA-03135: connection lost contact — transient as well.
                    logger(logging.WARNING, traceback.format_exc())
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                else:
                    logger(logging.ERROR, traceback.format_exc())
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            except ReplicationRuleCreationTemporaryFailed as e:
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                logger(logging.WARNING, 'Replica Creation temporary failed, retrying later for %s:%s', did.scope, did.name)
            except FlushError as e:
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                logger(logging.WARNING, 'Flush error for %s:%s', did.scope, did.name)
    except (DatabaseException, DatabaseError) as e:
        # Failures outside the per-did loop (e.g. in get_updated_dids):
        # transient pool/connection problems are warnings, anything else
        # is critical.
        if match('.*QueuePool.*', str(e.args[0])):
            logger(logging.WARNING, traceback.format_exc())
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        elif match('.*ORA-03135.*', str(e.args[0])):
            logger(logging.WARNING, traceback.format_exc())
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        else:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
    except Exception as e:
        # Catch-all so one bad cycle never kills the daemon loop.
        logger(logging.CRITICAL, traceback.format_exc())
        record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
def re_evaluator(once=False):
    """
    Main loop to check the re-evaluation of dids.

    Registers a heartbeat, then repeatedly fetches updated dids assigned to
    this worker and re-evaluates them until graceful_stop is set (or after a
    single pass when ``once`` is True).

    :param once: If True, run a single cycle and exit.
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    # (scope.internal, name) -> datetime until which the did is skipped
    paused_dids = {}  # {(scope, name): datetime}

    # Make an initial heartbeat so that all judge-evaluators have the correct worker number on the next try
    executable = 'judge-evaluator'
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat: refresh worker number / thread count each cycle
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)

            start = time.time()  # NOQA

            # Refresh paused dids: keep only entries whose pause deadline
            # has not yet passed (rebinds the local dict).
            paused_dids = dict((k, v) for k, v in iteritems(paused_dids) if datetime.utcnow() < v)

            # Select a bunch of dids for re-evaluation for this worker;
            # paused dids are excluded via blacklisted_dids.
            dids = get_updated_dids(total_workers=heartbeat['nr_threads'],
                                    worker_number=heartbeat['assign_thread'],
                                    limit=100,
                                    blacklisted_dids=[(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids])
            logging.debug('re_evaluator[%s/%s] index query time %f fetch size is %d (%d blacklisted)' % (heartbeat['assign_thread'], heartbeat['nr_threads'],
                                                                                                         time.time() - start,
                                                                                                         len(dids),
                                                                                                         len([(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids])))

            # If the list is empty, send the worker to sleep
            if not dids and not once:
                logging.debug('re_evaluator[%s/%s] did not get any work (paused_dids=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'], str(len(paused_dids))))
                graceful_stop.wait(30)
            else:
                done_dids = {}  # did_tag -> list of rule_evaluation_actions already handled this cycle
                for did in dids:
                    if graceful_stop.is_set():
                        break

                    # Check if this did has already been operated on in this
                    # batch (the updated_dids table can contain duplicates).
                    did_tag = '%s:%s' % (did.scope.internal, did.name)
                    if did_tag in done_dids:
                        if did.rule_evaluation_action in done_dids[did_tag]:
                            logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s already done' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                            delete_updated_did(id=did.id)
                            continue
                    else:
                        done_dids[did_tag] = []

                    # Skip paused dids (paused after an earlier lock collision)
                    if (did.scope.internal, did.name) in paused_dids:
                        continue

                    try:
                        start_time = time.time()
                        re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action)
                        logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name, time.time() - start_time))
                        delete_updated_did(id=did.id)
                        # Mark done only on success so a failed action can be
                        # retried if the same did appears again in the batch.
                        done_dids[did_tag].append(did.rule_evaluation_action)
                    except DataIdentifierNotFound:
                        # The did vanished in the meantime; nothing to evaluate.
                        delete_updated_did(id=did.id)
                    except (DatabaseException, DatabaseError) as e:
                        # ORA-00054 (row locked): another worker holds this
                        # did — pause it for a randomized 1-10 minutes.
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_dids[(did.scope.internal, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        elif match('.*QueuePool.*', str(e.args[0])):
                            # Connection-pool exhaustion: transient, warn only.
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            # ORA-03135: connection lost contact — transient.
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                    except FlushError as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Flush error for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
        except (DatabaseException, DatabaseError) as e:
            # Failures outside the per-did loop (e.g. in get_updated_dids):
            # transient pool/connection problems are warnings, anything else
            # is critical.
            if match('.*QueuePool.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            # Catch-all so one bad cycle never kills the daemon loop.
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    # Unregister the heartbeat on shutdown.
    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def re_evaluator(once=False, process=0, total_processes=1, thread=0, threads_per_process=1): """ Main loop to check the re-evaluation of dids. """ logging.info('re_evaluator: starting') logging.info('re_evaluator: started') paused_dids = {} # {(scope, name): datetime} while not graceful_stop.is_set(): try: # Select a bunch of dids for re evaluation for this worker start = time.time() # NOQA dids = get_updated_dids(total_workers=total_processes*threads_per_process-1, worker_number=process*threads_per_process+thread, limit=1000) logging.debug('Re-Evaluation index query time %f fetch size is %d' % (time.time() - start, len(dids))) # Refresh paused dids iter_paused_dids = deepcopy(paused_dids) for key in iter_paused_dids: if datetime.utcnow() > paused_dids[key]: del paused_dids[key] # Remove paused dids from result set dids = [did for did in dids if (did.scope, did.name) not in paused_dids] # If the list is empty, sent the worker to sleep if not dids and not once: logging.info('re_evaluator[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1)) time.sleep(10) else: record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 1) done_dids = {} for did in dids: if graceful_stop.is_set(): break # Try to delete all duplicate dids # delete_duplicate_updated_dids(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action, id=did.id) # Check if this did has already been operated on if '%s:%s' % (did.scope, did.name) in done_dids: if did.rule_evaluation_action in done_dids['%s:%s' % (did.scope, did.name)]: continue else: done_dids['%s:%s' % (did.scope, did.name)] = [] done_dids['%s:%s' % (did.scope, did.name)].append(did.rule_evaluation_action) try: start_time = time.time() re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action) logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s took %f' % (process*threads_per_process+thread, 
total_processes*threads_per_process-1, did.scope, did.name, time.time() - start_time)) delete_updated_did(id=did.id) except DataIdentifierNotFound, e: delete_updated_did(id=did.id) except (DatabaseException, DatabaseError), e: if isinstance(e.args[0], tuple): if match('.*ORA-00054.*', e.args[0][0]): paused_dids[(did.scope, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name)) record_counter('rule.judge.exceptions.LocksDetected') else: logging.error(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logging.error(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except ReplicationRuleCreationTemporaryFailed, e: record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))