Exemple #1
0
def rule_cleaner(once=False):
    """
    Main loop to check for expired replication rules
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-cleaners have the correct worker number on the next try
    live(executable='rucio-judge-cleaner', hostname=hostname, pid=pid, thread=current_thread)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable='rucio-judge-cleaner', hostname=hostname, pid=pid, thread=current_thread)

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_expired_rules(total_workers=heartbeat['nr_threads'] - 1,
                                      worker_number=heartbeat['assign_thread'],
                                      limit=200,
                                      blacklisted_rules=[key for key in paused_rules])
            logging.debug('rule_cleaner[%s/%s] index query time %f fetch size is %d' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, time.time() - start, len(rules)))

            if not rules and not once:
                logging.debug('rule_cleaner[%s/%s] did not get any work (paused_rules=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, str(len(paused_rules))))
                graceful_stop.wait(60)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logging.info('rule_cleaner[%s/%s]: Deleting rule %s with expression %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logging.debug('rule_cleaner[%s/%s]: deletion of %s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError, UnsupportedOperation), e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                            record_counter('rule.judge.exceptions.LocksDetected')
                            logging.warning('rule_cleaner[%s/%s]: Locks detected for %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RuleNotFound, e:
                        pass
Exemple #2
0
def rule_cleaner(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check for expired replication rules
    """

    logging.info('rule_cleaner: starting')

    logging.info('rule_cleaner: started')

    paused_rules = {}  # {rule_id: datetime}

    while not graceful_stop.is_set():
        try:
            start = time.time()
            rules = get_expired_rules(total_workers=total_processes*threads_per_process-1,
                                      worker_number=process*threads_per_process+thread,
                                      limit=1000)
            logging.debug('rule_cleaner index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            # Remove paused rules from result set
            rules = [rule for rule in rules if rule[0] not in paused_rules]

            if not rules and not once:
                logging.info('rule_cleaner[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 1)
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logging.info('rule_cleaner[%s/%s]: Deleting rule %s with expression %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logging.debug('rule_cleaner[%s/%s]: deletion of %s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError, AccessDenied), e:
                        if isinstance(e.args[0], tuple):
                            if match('.*ORA-00054.*', e.args[0][0]):
                                paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                                record_counter('rule.judge.exceptions.LocksDetected')
                                logging.warning('rule_cleaner[%s/%s]: Locks detected for %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id))
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
        except Exception, e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            return
Exemple #3
0
def rule_cleaner(once=False):
    """
    Main loop to check for expired replication rules
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-cleaners have the correct worker number on the next try
    executable = 'judge-cleaner'
    heartbeat = live(executable=executable,
                     hostname=hostname,
                     pid=pid,
                     thread=current_thread)
    prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'],
                                        heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=current_thread)
            prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'],
                                                heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_expired_rules(
                total_workers=heartbeat['nr_threads'],
                worker_number=heartbeat['assign_thread'],
                limit=200,
                blocked_rules=[key for key in paused_rules])
            logger(
                logging.DEBUG, 'index query time %f fetch size is %d' %
                (time.time() - start, len(rules)))

            if not rules and not once:
                logger(
                    logging.DEBUG, 'did not get any work (paused_rules=%s)' %
                    str(len(paused_rules)))
                graceful_stop.wait(60)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logger(
                        logging.INFO, 'Deleting rule %s with expression %s' %
                        (rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logger(
                            logging.DEBUG, 'deletion of %s took %f' %
                            (rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError,
                            UnsupportedOperation) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow(
                            ) + timedelta(seconds=randint(600, 2400))
                            record_counter(
                                'rule.judge.exceptions.LocksDetected')
                            logger(logging.WARNING,
                                   'Locks detected for %s' % rule_id)
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logger(logging.WARNING,
                                   'DatabaseException',
                                   exc_info=True)
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logger(logging.WARNING,
                                   'DatabaseException',
                                   exc_info=True)
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                        else:
                            logger(logging.ERROR,
                                   'DatabaseException',
                                   exc_info=True)
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                    except RuleNotFound:
                        pass
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' %
                               e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' %
                               e.__class__.__name__)
            else:
                logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' %
                               e.__class__.__name__)
        except Exception as e:
            logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=current_thread)
Exemple #4
0
def run_once(paused_rules, heartbeat_handler, **_kwargs):
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        start = time.time()

        # Refresh paused rules
        iter_paused_rules = deepcopy(paused_rules)
        for key in iter_paused_rules:
            if datetime.utcnow() > paused_rules[key]:
                del paused_rules[key]

        rules = get_expired_rules(total_workers=total_workers,
                                  worker_number=worker_number,
                                  limit=200,
                                  blocked_rules=[key for key in paused_rules])
        logger(
            logging.DEBUG, 'index query time %f fetch size is %d' %
            (time.time() - start, len(rules)))

        if not rules:
            logger(
                logging.DEBUG, 'did not get any work (paused_rules=%s)' %
                str(len(paused_rules)))
            return

        for rule in rules:
            _, _, logger = heartbeat_handler.live()
            rule_id = rule[0]
            rule_expression = rule[1]
            logger(
                logging.INFO, 'Deleting rule %s with expression %s' %
                (rule_id, rule_expression))
            if graceful_stop.is_set():
                break
            try:
                start = time.time()
                delete_rule(rule_id=rule_id, nowait=True)
                logger(
                    logging.DEBUG,
                    'deletion of %s took %f' % (rule_id, time.time() - start))
            except (DatabaseException, DatabaseError,
                    UnsupportedOperation) as e:
                if match('.*ORA-00054.*', str(e.args[0])):
                    paused_rules[rule_id] = datetime.utcnow() + timedelta(
                        seconds=randint(600, 2400))
                    record_counter('rule.judge.exceptions.{exception}',
                                   labels={'exception': 'LocksDetected'})
                    logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                elif match('.*QueuePool.*', str(e.args[0])):
                    logger(logging.WARNING, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}',
                                   labels={'exception': e.__class__.__name__})
                elif match('.*ORA-03135.*', str(e.args[0])):
                    logger(logging.WARNING, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}',
                                   labels={'exception': e.__class__.__name__})
                else:
                    logger(logging.ERROR, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}',
                                   labels={'exception': e.__class__.__name__})
            except RuleNotFound:
                pass
    except (DatabaseException, DatabaseError) as e:
        if match('.*QueuePool.*', str(e.args[0])):
            logger(logging.WARNING, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}',
                           labels={'exception': e.__class__.__name__})
        elif match('.*ORA-03135.*', str(e.args[0])):
            logger(logging.WARNING, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}',
                           labels={'exception': e.__class__.__name__})
        else:
            logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}',
                           labels={'exception': e.__class__.__name__})
    except Exception as e:
        logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
        record_counter('rule.judge.exceptions.{exception}',
                       labels={'exception': e.__class__.__name__})