Example no. 1
0
def run_once(paused_dids, did_limit, heartbeat_handler, **_kwargs):
    """
    Select a batch of updated DIDs for this worker and re-evaluate their rules.

    :param paused_dids: dict {(scope, name): datetime} of DIDs to skip until the
                        stored time. Mutated in place: expired entries are pruned,
                        and DIDs hitting DB lock contention are (re)added.
    :param did_limit: maximum number of updated DIDs to fetch in this cycle.
    :param heartbeat_handler: provides (worker_number, total_workers, logger) via live().
    """
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        # heartbeat
        start = time.time()  # NOQA

        # Refresh paused dids: prune expired entries in place so the caller's
        # dict stays current across invocations.
        now = datetime.utcnow()
        for key in [key for key, pause_until in paused_dids.items() if now > pause_until]:
            del paused_dids[key]

        # Build the blocked list once; reused below for the log line instead of
        # recomputing the whole comprehension just to take its length.
        blocked_dids = [(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids]

        # Select a bunch of dids for re evaluation for this worker
        dids = get_updated_dids(total_workers=total_workers,
                                worker_number=worker_number,
                                limit=did_limit,
                                blocked_dids=blocked_dids)
        logger(logging.DEBUG, 'index query time %f fetch size is %d (%d blocked)', time.time() - start, len(dids),
               len(blocked_dids))

        # If the list is empty, send the worker to sleep
        if not dids:
            logger(logging.DEBUG, 'did not get any work (paused_dids=%s)', str(len(paused_dids)))
            return

        # {scope:name -> [rule_evaluation_action, ...]} handled during this cycle
        done_dids = {}
        for did in dids:
            _, _, logger = heartbeat_handler.live()
            if graceful_stop.is_set():
                break

            # Check if this did has already been operated on
            did_tag = '%s:%s' % (did.scope.internal, did.name)
            if did_tag in done_dids:
                if did.rule_evaluation_action in done_dids[did_tag]:
                    logger(logging.DEBUG, 'evaluation of %s:%s already done', did.scope, did.name)
                    delete_updated_did(id_=did.id)
                    continue
            else:
                done_dids[did_tag] = []

            # Jump paused dids
            if (did.scope.internal, did.name) in paused_dids:
                continue

            try:
                start_time = time.time()
                re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action)
                logger(logging.DEBUG, 'evaluation of %s:%s took %f', did.scope, did.name, time.time() - start_time)
                delete_updated_did(id_=did.id)
                done_dids[did_tag].append(did.rule_evaluation_action)
            except DataIdentifierNotFound:
                # DID disappeared in the meantime: nothing left to evaluate.
                delete_updated_did(id_=did.id)
            except (DatabaseException, DatabaseError) as e:
                if match('.*ORA-000(01|54).*', str(e.args[0])):
                    # DB contention (ORA-00001/00054): back off this DID for 1-10 minutes.
                    paused_dids[(did.scope.internal, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                    logger(logging.WARNING, 'Locks detected for %s:%s', did.scope, did.name)
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': 'LocksDetected'})
                elif match('.*QueuePool.*', str(e.args[0])):
                    logger(logging.WARNING, traceback.format_exc())
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                elif match('.*ORA-03135.*', str(e.args[0])):
                    logger(logging.WARNING, traceback.format_exc())
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                else:
                    logger(logging.ERROR, traceback.format_exc())
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            except ReplicationRuleCreationTemporaryFailed as e:
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                logger(logging.WARNING, 'Replica Creation temporary failed, retrying later for %s:%s', did.scope, did.name)
            except FlushError as e:
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                logger(logging.WARNING, 'Flush error for %s:%s', did.scope, did.name)
    except (DatabaseException, DatabaseError) as e:
        # Errors raised outside the per-DID loop (e.g. by get_updated_dids itself).
        if match('.*QueuePool.*', str(e.args[0])):
            logger(logging.WARNING, traceback.format_exc())
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        elif match('.*ORA-03135.*', str(e.args[0])):
            logger(logging.WARNING, traceback.format_exc())
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        else:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
    except Exception as e:
        logger(logging.CRITICAL, traceback.format_exc())
        record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
Example no. 2
0
def re_evaluator(once=False):
    """
    Main loop to check the re-evaluation of dids.

    :param once: If True, perform a single iteration of the loop and return.
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_dids = {}  # {(scope, name): datetime}

    # Make an initial heartbeat so that all judge-evaluators have the correct worker number on the next try
    executable = 'judge-evaluator'
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30)

            start = time.time()  # NOQA

            # Refresh paused dids: drop entries whose pause window has expired
            paused_dids = dict((k, v) for k, v in iteritems(paused_dids) if datetime.utcnow() < v)

            # Build the blacklist once; reused below for the debug log line instead of
            # recomputing the whole comprehension just to take its length.
            blacklisted_dids = [(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids]

            # Select a bunch of dids for re evaluation for this worker
            dids = get_updated_dids(total_workers=heartbeat['nr_threads'],
                                    worker_number=heartbeat['assign_thread'],
                                    limit=100,
                                    blacklisted_dids=blacklisted_dids)
            logging.debug('re_evaluator[%s/%s] index query time %f fetch size is %d (%d blacklisted)' % (heartbeat['assign_thread'],
                                                                                                         heartbeat['nr_threads'],
                                                                                                         time.time() - start,
                                                                                                         len(dids),
                                                                                                         len(blacklisted_dids)))

            # If the list is empty, send the worker to sleep
            if not dids and not once:
                logging.debug('re_evaluator[%s/%s] did not get any work (paused_dids=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'], str(len(paused_dids))))
                graceful_stop.wait(30)
            else:
                # {scope:name -> [rule_evaluation_action, ...]} handled during this cycle
                done_dids = {}
                for did in dids:
                    if graceful_stop.is_set():
                        break

                    # Check if this did has already been operated on
                    did_tag = '%s:%s' % (did.scope.internal, did.name)
                    if did_tag in done_dids:
                        if did.rule_evaluation_action in done_dids[did_tag]:
                            logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s already done' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                            delete_updated_did(id=did.id)
                            continue
                    else:
                        done_dids[did_tag] = []

                    # Jump paused dids
                    if (did.scope.internal, did.name) in paused_dids:
                        continue

                    try:
                        start_time = time.time()
                        re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action)
                        logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name, time.time() - start_time))
                        delete_updated_did(id=did.id)
                        done_dids[did_tag].append(did.rule_evaluation_action)
                    except DataIdentifierNotFound:
                        # DID disappeared in the meantime: nothing left to evaluate.
                        delete_updated_did(id=did.id)
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            # DB row lock contention (ORA-00054): back off this DID for 1-10 minutes.
                            paused_dids[(did.scope.internal, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                    except FlushError as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Flush error for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
        except (DatabaseException, DatabaseError) as e:
            # Errors raised outside the per-DID loop (e.g. by get_updated_dids itself).
            if match('.*QueuePool.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)

        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
Example no. 3
0
def re_evaluator(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check the re-evaluation of dids.
    """

    logging.info('re_evaluator: starting')

    logging.info('re_evaluator: started')

    paused_dids = {}  # {(scope, name): datetime}

    while not graceful_stop.is_set():
        try:
            # Select a bunch of dids for re evaluation for this worker
            start = time.time()  # NOQA
            dids = get_updated_dids(total_workers=total_processes*threads_per_process-1,
                                    worker_number=process*threads_per_process+thread,
                                    limit=1000)
            logging.debug('Re-Evaluation index query time %f fetch size is %d' % (time.time() - start, len(dids)))

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            # Remove paused dids from result set
            dids = [did for did in dids if (did.scope, did.name) not in paused_dids]

            # If the list is empty, sent the worker to sleep
            if not dids and not once:
                logging.info('re_evaluator[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 1)

                done_dids = {}
                for did in dids:
                    if graceful_stop.is_set():
                        break

                    # Try to delete all duplicate dids
                    # delete_duplicate_updated_dids(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action, id=did.id)

                    # Check if this did has already been operated on
                    if '%s:%s' % (did.scope, did.name) in done_dids:
                        if did.rule_evaluation_action in done_dids['%s:%s' % (did.scope, did.name)]:
                            continue
                    else:
                        done_dids['%s:%s' % (did.scope, did.name)] = []
                    done_dids['%s:%s' % (did.scope, did.name)].append(did.rule_evaluation_action)

                    try:
                        start_time = time.time()
                        re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action)
                        logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name, time.time() - start_time))
                        delete_updated_did(id=did.id)
                    except DataIdentifierNotFound, e:
                        delete_updated_did(id=did.id)
                    except (DatabaseException, DatabaseError), e:
                        if isinstance(e.args[0], tuple):
                            if match('.*ORA-00054.*', e.args[0][0]):
                                paused_dids[(did.scope, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                                logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
                                record_counter('rule.judge.exceptions.LocksDetected')
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed, e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))